สร้างผลคูณคาร์ทีเซียนแบบไบนารีที่ผ่านการกรอง

คำชี้แจงปัญหา

ฉันกำลังมองหาวิธีที่มีประสิทธิภาพในการสร้างผลคูณคาร์ทีเซียนแบบไบนารีเต็มรูปแบบ (ตารางที่มีชุดค่าผสมของ True และ False ทั้งหมดสำหรับจำนวนคอลัมน์ที่กำหนด) ซึ่งถูกกรองด้วยเงื่อนไขการไม่เกิดร่วมกันบางอย่าง ตัวอย่างเช่น สำหรับสามคอลัมน์/บิต n=3 เราจะได้ตารางเต็มดังนี้

df_combs = pd.DataFrame(itertools.product(*([[True, False]] * n)))
       0      1      2
0   True   True   True
1   True   True  False
2   True  False   True
3   True  False  False
...

ตารางนี้ควรถูกกรองด้วยรายการของพจนานุกรม (dict) ที่กำหนดชุดค่าผสมที่ไม่สามารถเกิดร่วมกันได้ ดังนี้:

mutually_excl = [{0: False, 1: False, 2: True},
                 {0: True, 2: True}]

โดยที่คีย์แสดงถึงคอลัมน์ในตารางด้านบน ตัวอย่างนี้อ่านได้ว่า:

ถ้า 0 เป็นเท็จและ 1 เป็นเท็จ แล้ว 2 จะเป็นจริงไม่ได้
ถ้า 0 เป็นจริง แล้ว 2 จะเป็นจริงไม่ได้

ขึ้นอยู่กับตัวกรองเหล่านี้ผลลัพธ์ที่ต้องการคือ:

       0      1      2
1   True   True  False
3   True  False  False
4  False   True   True
5  False   True  False
7  False  False  False

ในกรณีการใช้งานของฉัน ตารางที่ผ่านการกรองแล้วมีขนาดเล็กกว่าผลคูณคาร์ทีเซียนเต็มรูปแบบหลายอันดับขนาด (เช่น ประมาณ 1,000 แถว แทนที่จะเป็น 2**24 (16777216) แถว)

ด้านล่างนี้เป็นคำตอบสามข้อในปัจจุบันของฉันซึ่งแต่ละข้อล้วนมีข้อดีและข้อเสียแตกต่างกันไป

import functools
import itertools
import operator
import random
import time

import numpy as np
import pandas as pd
import wrapt

def get_mutually_excl(n, nfilt):  # generate random example filter
    """Generate a reproducible list of random example exclusion filters.

    Each filter is a dict mapping a column index from ``range(n)`` to the
    boolean value that column has in the excluded combination.  For every
    filter, between 2 and ``n // 3`` (index, value) pairs are drawn; because
    the same index can be drawn more than once, a filter may end up with
    fewer keys than draws (down to a single key).

    Example: ``get_mutually_excl(9, 2)`` returns a list of two filters shaped
    like ``[{1: True, 2: False}, {3: False, 2: True, 6: False}]``.

    Args:
        n: number of columns; drawn indices lie in ``range(n)``.
        nfilt: number of filters to generate.

    Returns:
        list[dict[int, bool]]: the same list for the same ``(n, nfilt)``.

    Raises:
        ValueError: if ``nfilt > 0`` and ``n < 6``, because the range for the
            number of draws (``2 .. n // 3``) is then empty.
    """
    # A private generator with the fixed seed yields the same sequence as
    # seeding the module-level RNG, without clobbering the global random
    # state for every other user of ``random``.
    rng = random.Random(2)
    return [{rng.choice(range(n)): rng.choice([True, False])
             for _ in range(rng.randint(2, n // 3))}
            for _ in range(nfilt)]

@wrapt.decorator
def timediff(f, _, args, kwargs):
    """Decorator body that times a single call of the wrapped function.

    ``wrapt`` passes the wrapped callable ``f``, the bound instance (unused
    here) and the positional / keyword arguments of the call.

    Returns:
        tuple: ``(result, elapsed)`` where ``result`` is whatever ``f``
        returned and ``elapsed`` is the difference of two
        ``time.perf_counter()`` readings taken around the call.
    """
    t = time.perf_counter()
    # Forward keyword arguments too; dropping them would silently ignore
    # options such as ``dtype=...`` passed by keyword to a decorated function.
    res = f(*args, **kwargs)
    return res, time.perf_counter() - t

โซลูชันที่ 1: ตัวกรองก่อนแล้วจึงผสาน

ขยายรายการตัวกรองแต่ละรายการ (เช่น {0: True, 2: True}) ให้เป็นตารางย่อยที่มีคอลัมน์ตรงกับดัชนีในรายการตัวกรองนั้น ([0, 2]) จากนั้นลบแถวเดียวที่ถูกกรองออกจากตารางย่อยนี้ ([True, True]) แล้วผสานเข้ากับตารางเต็มเพื่อให้ได้รายการชุดค่าผสมที่ผ่านการกรองทั้งหมด

@timediff
def make_df_comb_filt_merge(n, nfilt):
    """Build the filtered table by merging one small sub-table per filter.

    Starts from the full boolean product of the columns that no filter
    mentions, then, for each filter, builds the product of that filter's
    columns minus the single excluded row and inner-merges it into the
    running table (on the shared columns, or as a cross join through the
    constant helper column ``temp`` when nothing is shared).

    Args:
        n: number of boolean columns.
        nfilt: number of filters requested from ``get_mutually_excl``.

    Returns:
        DataFrame with columns ``0 .. n-1``, sorted descending over all
        columns, with a fresh integer index.
    """
    mutually_excl = get_mutually_excl(n, nfilt)

    # Columns not referenced by any filter.  Kept as a sorted list: recent
    # pandas versions reject a ``set`` for ``columns=``, and a list gives a
    # deterministic column order.
    cols_missing = sorted(
        set(range(n)) - set(itertools.chain.from_iterable(mutually_excl)))

    # Complete table of the unfiltered columns; the constant column "temp"
    # serves as a join key when a filter shares no column with the table.
    df_comb = pd.DataFrame(
        itertools.product([True, False], repeat=len(cols_missing)),
        columns=cols_missing).assign(temp=1)

    for filt in mutually_excl:  # loop through individual filters

        # columns and bool values of this filter as two tuples, same order
        list_col, list_bool = zip(*filt.items())

        # All combinations of this filter's columns except the single
        # excluded one.  Building the frame from the already-filtered rows
        # avoids assigning into a slice of another frame further down.
        rows = [row
                for row in itertools.product([True, False],
                                             repeat=len(list_col))
                if row != list_bool]
        df = pd.DataFrame(rows, columns=list(list_col))

        # merge on the columns already present in the running table
        merge_cols = [col for col in df.columns if col in df_comb.columns]
        if not merge_cols:
            merge_cols = ['temp']
            df['temp'] = 1

        # inner merge with the running table
        df_comb = pd.merge(df_comb, df, on=merge_cols)

    df_comb = df_comb.drop(columns='temp')
    df_comb = df_comb[list(range(n))]
    df_comb = df_comb.sort_values(df_comb.columns.tolist(), ascending=False)

    return df_comb.reset_index(drop=True)

โซลูชันที่ 2: การขยายตัวเต็มแล้วกรอง

สร้าง DataFrame สำหรับผลคูณคาร์ทีเซียนเต็มรูปแบบ ซึ่งทั้งหมดจะถูกเก็บไว้ในหน่วยความจำ จากนั้นวนรอบตัวกรองและสร้างมาสก์สำหรับแต่ละตัว แล้วใช้มาสก์แต่ละตัวกับตาราง


@timediff
def make_df_comb_exp_filt(n, nfilt):
    """Expand the full boolean product into a DataFrame, then filter it.

    For every filter a boolean mask marks the rows in which all of the
    filter's columns equal the excluded values; those rows are dropped
    before the next filter is applied.

    Returns the remaining rows with a fresh integer index.
    """
    exclusions = get_mutually_excl(n, nfilt)

    # every True/False combination over n columns, materialised at once
    table = pd.DataFrame(itertools.product([True, False], repeat=n),
                         dtype=bool)

    for excluded in exclusions:
        # start with "every row matches" and narrow down column by column
        hit = pd.Series(True, index=table.index)
        for column, value in excluded.items():
            hit = hit & (table[column] == value)

        # keep only the rows that do not match the excluded combination
        table = table.loc[~hit]

    return table.reset_index(drop=True)

โซลูชันที่ 3: ตัวกรองตัววนซ้ำ

เก็บผลคูณคาร์ทีเซียนเต็มรูปแบบไว้เป็นตัววนซ้ำ (iterator) แล้ววนซ้ำพร้อมกับตรวจสอบแต่ละแถวว่าถูกตัวกรองใดตัวหนึ่งตัดออกหรือไม่

@timediff
def make_df_iter_filt(n, nfilt):
    """Filter the boolean product row by row without materialising it.

    Iterates lazily over all ``2**n`` True/False combinations and keeps a
    row only if it matches none of the excluded combinations.

    Args:
        n: number of boolean columns.
        nfilt: number of filters requested from ``get_mutually_excl``.

    Returns:
        DataFrame of the surviving rows with a fresh integer index.
    """
    mutually_excl = get_mutually_excl(n, nfilt)

    def make_matcher(filt):
        # Build a predicate telling whether a row equals the excluded
        # combination on this filter's columns.
        cols, vals = zip(*filt.items())
        if len(cols) == 1:
            # itemgetter with a single index returns the bare element, not
            # a 1-tuple, so comparing it to ``vals`` would never be equal
            # and the filter would be silently ignored.
            col, val = cols[0], vals[0]
            return lambda row: row[col] == val
        getter = operator.itemgetter(*cols)
        return lambda row: getter(row) == vals

    # one predicate per filter, built once instead of looked up per row
    matchers = [make_matcher(filt) for filt in mutually_excl]

    # lazy iterator over every combination
    combs_iter = itertools.product([True, False], repeat=n)

    # drop a row as soon as any filter matches it
    df_comb = pd.DataFrame([row for row in combs_iter
                            if not any(matches(row) for matches in matchers)])

    return df_comb.reset_index(drop=True)

เรียกใช้ตัวอย่าง

# Time all three solutions for every (n, nfilt) pair of the benchmark grid;
# only the elapsed time (second element returned by each call) is kept.
dict_time = {}

for n, nfilt in itertools.product(range(16, 23, 2), range(3, 20)):
    t_exp_filt = make_df_comb_exp_filt(n, nfilt)[1]
    t_filt_merge = make_df_comb_filt_merge(n, nfilt)[1]
    t_iter_filt = make_df_iter_filt(n, nfilt)[1]
    dict_time[(n, nfilt)] = {'exp_filt': t_exp_filt,
                             'filt_merge': t_filt_merge,
                             'iter_filt': t_iter_filt}

การวิเคราะห์

import seaborn as sns
import matplotlib.pyplot as plt

# Reshape the timing dict into long format: one row per (n, nfilt, solution).
df_time = pd.DataFrame.from_dict(dict_time, orient='index')
df_time = df_time.rename_axis(["n", "nfilt"])
df_time = df_time.stack().reset_index()
df_time = df_time.rename(columns={'level_2': 'solution', 0: 'time'})

# One facet per n, one line per solution: time against number of filters.
g = sns.FacetGrid(df_time.query('n in %s' % str([16,18,20,22])),
                  col="n", hue="solution", sharey=False)
g = g.map(plt.plot, "nfilt", "time", marker="o")
g = g.add_legend()

โซลูชันที่ 3 : วิธีที่ใช้ตัววนซ้ำ (comb_iterator) มีเวลาทำงานที่ย่ำแย่ แต่แทบไม่ใช้หน่วยความจำเลย ฉันรู้สึกว่ายังมีช่องว่างให้ปรับปรุงได้ แม้ว่าการวนลูปที่หลีกเลี่ยงไม่ได้น่าจะเป็นตัวกำหนดขีดจำกัดของเวลาทำงานก็ตาม

โซลูชันที่ 2 : การขยายผลคูณคาร์ทีเซียนเต็มรูปแบบลงใน DataFrame (exp_filt) ทำให้การใช้หน่วยความจำพุ่งสูงขึ้นอย่างมาก ซึ่งเป็นสิ่งที่ฉันต้องการหลีกเลี่ยง ส่วนเวลาทำงานอยู่ในระดับที่ยอมรับได้

โซลูชันที่ 1 : การผสาน DataFrame ที่สร้างจากตัวกรองแต่ละตัว (filt_merge) ดูจะเป็นวิธีที่ดีสำหรับการใช้งานจริงของฉัน (สังเกตว่าเวลาทำงานลดลงเมื่อมีตัวกรองจำนวนมากขึ้น ซึ่งเป็นผลมาจากตาราง cols_missing ที่เล็กลง) อย่างไรก็ตาม วิธีนี้ยังไม่น่าพอใจเสียทีเดียว: ถ้าตัวกรองเพียงตัวเดียวครอบคลุมทุกคอลัมน์ ผลคูณคาร์ทีเซียนทั้งหมด (2**n) จะไปอยู่ในหน่วยความจำ ทำให้วิธีนี้แย่ยิ่งกว่า comb_iterator

คำถาม: มีแนวคิดอื่นอีกไหม? มีโค้ดสองบรรทัดที่ฉลาดสุด ๆ บ้างไหม? วิธีที่ใช้ตัววนซ้ำสามารถปรับปรุงได้หรือไม่?

python pandas dataframe

— mcsoini
แหล่งที่มา

ตัวแก้ข้อจำกัด (constraint solver) น่าจะมีประสิทธิภาพสูงกว่าวิธีการเหล่านี้ เพราะหาคำตอบโดยการลดขนาดพื้นที่การค้นหา ลองดู or-tools ก็ได้ นี่คือตัวอย่างสำหรับ SAT

— ayhan

@ayhan ฉันลองแล้ว (ดูคำตอบ) เป็นวิธีที่น่าสนใจ แต่ไม่เหมาะที่จะใช้เป็นวิธีแก้ปัญหาทั่วไป ขอบคุณสำหรับคำแนะนำ ฉันได้เรียนรู้อะไรบางอย่าง :)

— mcsoini

ใช่ ฟังดูเหมือนปัญหา SAT ดังนั้นคุณควรใช้ตัวแก้ปัญหา (solver) หากปัญหามีขนาดใหญ่พอ คุณอาจลองถามที่ or.stackexchange.com

— Stradivari

@Stradivari การกำหนดโจทย์นี้ให้เป็นปัญหา SAT นั้นสมเหตุสมผลอย่างแน่นอน แต่ฉันไม่ชอบที่วิธีนี้ขึ้นอยู่กับจำนวนตัวกรองอย่างมาก อาจเป็นเพราะฉันยังใช้วิธีนี้ได้ไม่ถูกต้องก็ได้ เนื่องจากคุณคุ้นเคยกับ or-tools เป็นอย่างดี คุณอาจอยากลองดูคำถามที่เกี่ยวข้องของฉัน... ซึ่งยังไม่มีคำตอบที่ได้รับการยอมรับ ;)

— 24919

ลองจับเวลาโค้ดต่อไปนี้ดู:

def in_filter(arr, arr_filt, n):
    """Return a boolean mask of the integers whose bits match a filter.

    Args:
        arr: 1-D NumPy integer array; each value encodes one row of the
            n-column boolean table, column 0 being the most significant of
            the ``n`` bits.
        arr_filt: 2-D integer array with one ``(column, bit)`` row per
            filter entry, where ``bit`` is 1 for True and 0 for False.
        n: number of columns / bits per value.

    Returns:
        Boolean array with one entry per element of ``arr``; True where every
        listed column has the listed bit value.
    """
    arr_filt = np.asarray(arr_filt)
    # Cast shift amounts and expected bits to arr's dtype: shifting uint64
    # data by int64 amounts has no common integer type and raises TypeError.
    shifts = (n - 1 - arr_filt[:, 0]).astype(arr.dtype)
    expected = arr_filt[:, 1].astype(arr.dtype)
    bits = (arr[:, None] >> shifts) & arr.dtype.type(1)
    return (bits == expected).all(axis=1)

def bits_to_boolean(arr, n):
    """Unpack each integer in ``arr`` into a row of ``n`` booleans.

    Column 0 of the result holds bit ``n - 1`` (the most significant of the
    ``n`` bits) and the last column holds bit 0.
    """
    # shift amounts n-1, n-2, ..., 0 in the same dtype as the data
    shifts = np.flip(np.arange(n, dtype=arr.dtype))
    shifted = np.right_shift(arr[:, None], shifts)
    return np.bitwise_and(shifted, 1).astype(bool)

@timediff
def recursive_filter(n, nfilt, dtype='uint32'):
    """Filter the boolean product encoded as the integers ``0 .. 2**n - 1``.

    Each filter in turn removes the integers whose bit pattern matches the
    excluded combination; the survivors are unpacked into boolean rows and
    returned in reversed order.
    """
    exclusions = get_mutually_excl(n, nfilt)
    survivors = np.arange(2**n, dtype=dtype)

    for excluded in exclusions:
        # (column, value) pairs as a 2-column array
        pairs = np.array(list(excluded.items()))
        keep = ~in_filter(survivors, pairs, n)
        survivors = survivors[keep]

    table = bits_to_boolean(survivors, n)
    return table[::-1]

วิธีนี้มองผลคูณคาร์ทีเซียนแบบไบนารีเป็นบิตที่เข้ารหัสอยู่ในช่วงจำนวนเต็ม 0..<2**n และใช้ฟังก์ชันแบบเวกเตอร์เพื่อลบจำนวนที่มีลำดับบิตตรงกับตัวกรองที่กำหนดออกซ้ำไปเรื่อย ๆ ทีละตัวกรอง

วิธีนี้ใช้หน่วยความจำได้มีประสิทธิภาพกว่าการจัดสรรผลคูณคาร์ทีเซียนของ [True, False] ทั้งหมด เนื่องจากค่าบูลีนแต่ละค่าจะถูกเก็บด้วยอย่างน้อย 8 บิต (มากกว่าที่จำเป็น 7 บิต) แต่ก็ยังใช้หน่วยความจำมากกว่าวิธีที่ใช้ตัววนซ้ำ หากคุณต้องการวิธีแก้ปัญหาสำหรับ n ขนาดใหญ่ คุณสามารถแบ่งงานนี้ออกเป็นส่วนย่อยได้ โดยจัดสรรและประมวลผลทีละช่วงย่อย ฉันเคยทำแบบนี้ในการเขียนโค้ดครั้งแรก แต่ไม่ได้ให้ประโยชน์มากนักสำหรับ n<=22 และยังต้องคำนวณขนาดของอาร์เรย์เอาต์พุต ซึ่งจะซับซ้อนเมื่อมีตัวกรองที่ซ้อนทับกัน

— Seb
แหล่งที่มา

นี่ช่างยอดเยี่ยมจริงๆ!

— mcsoini

จากความคิดเห็นของ @ayhan ฉันได้เขียนวิธีแก้ปัญหาที่อาศัย SAT ขึ้นมา แม้ว่าแนวคิดนี้จะยอดเยี่ยม แต่วิธีนี้ทำงานได้ช้ามากเมื่อมีตัวแปรไบนารีจำนวนมากขึ้น ฉันคาดว่าเรื่องนี้คล้ายกับปัญหา IP ขนาดใหญ่ ซึ่งก็ไม่ใช่เรื่องง่ายเช่นกัน อย่างไรก็ตาม การที่เวลาทำงานขึ้นอยู่กับจำนวนตัวกรองอย่างมากอาจทำให้วิธีนี้เป็นตัวเลือกที่ใช้ได้สำหรับการตั้งค่าพารามิเตอร์บางแบบ แต่ฉันจะไม่ใช้มันเป็นวิธีแก้ปัญหาทั่วไป

from ortools.sat.python import cp_model

class VarArraySolutionCollector(cp_model.CpSolverSolutionCallback):
    """Solver callback that stores the variable values of every solution."""

    def __init__(self, variables):
        cp_model.CpSolverSolutionCallback.__init__(self)
        # one list of values is appended per reported solution
        self.solution_list = []
        self.__variables = variables

    def on_solution_callback(self):
        # read the current value of each tracked variable, in order
        values = list(map(self.Value, self.__variables))
        self.solution_list.append(values)


@timediff
def make_df_comb_sat(n, nfilt):
    """Enumerate the allowed combinations with the CP-SAT solver.

    Creates one boolean variable per column and, for every excluded
    combination, one clause requiring that at least one of its columns
    differs from the excluded value.  All solutions are collected into a
    DataFrame, cast to bool, sorted descending over all columns and given a
    fresh integer index.
    """
    exclusions = get_mutually_excl(n, nfilt)

    model = cp_model.CpModel()

    # one boolean variable per column, keyed by its name (x00, x01, ...)
    make_var_name = 'x{:02d}'.format
    vrs = {}
    for idx in range(n):
        name = make_var_name(idx)
        vrs[name] = model.NewBoolVar(name)

    for excluded in exclusions:
        clause = []
        for idx, value in excluded.items():
            var = vrs[make_var_name(idx)]
            # negate the variable when the excluded value is True
            clause.append(var.Not() if value else var)
        model.AddBoolOr(clause)

    solver = cp_model.CpSolver()
    collector = VarArraySolutionCollector(vrs.values())
    solver.SearchForAllSolutions(model, collector)

    df_comb = pd.DataFrame(collector.solution_list).astype(bool)
    sort_cols = df_comb.columns.tolist()
    return df_comb.sort_values(sort_cols, ascending=False).reset_index(drop=True)

— mcsoini
แหล่งที่มา