Property setter สำหรับคลาสย่อยของ Pandas DataFrame

ฉันกำลังพยายามสร้างคลาสย่อยของ pd.DataFrame ที่มีอาร์กิวเมนต์บังคับสองตัวตอนสร้างอินสแตนซ์ (group และ timestamp_col) ฉันต้องการตรวจสอบความถูกต้องของอาร์กิวเมนต์ group และ timestamp_col จึงเขียนเมธอด setter ให้กับแต่ละ property ทุกอย่างทำงานได้ดีจนกระทั่งฉันเรียก set_index() แล้วได้รับ TypeError: 'NoneType' object is not iterable ดูเหมือนว่าไม่มีอาร์กิวเมนต์ใดถูกส่งไปยังฟังก์ชัน setter ของฉันใน test_set_index และ test_assignment_with_indexed_obj ถ้าฉันเพิ่ม if g == None: return ในฟังก์ชัน setter กรณีทดสอบก็จะผ่าน แต่ฉันไม่คิดว่านั่นเป็นวิธีแก้ที่เหมาะสม

ฉันควรทำการตรวจสอบความถูกต้องของ property สำหรับอาร์กิวเมนต์บังคับเหล่านี้อย่างไร

ด้านล่างคือคลาสของฉัน:

import pandas as pd
import numpy as np


class HistDollarGains(pd.DataFrame):
    """DataFrame subclass carrying validated ``group``/``timestamp_col`` metadata.

    ``group`` is stored as a list of column labels and ``timestamp_col`` as a
    single column label.  Both are checked against ``self.columns`` whenever
    they are assigned through the public properties.

    Parameters
    ----------
    data, index, columns, dtype, copy
        Forwarded unchanged to ``pandas.DataFrame``.
    group : str or iterable of column labels, or None
        Column(s) identifying a group.  A single string is wrapped in a list.
        ``None`` leaves the value unset (this is what pandas-internal
        construction passes; see ``_internal_ctor``).
    timestamp_col : column label or None
        Column holding the timestamp.  ``None`` leaves the value unset.

    Raises
    ------
    ValueError
        If ``group`` or ``timestamp_col`` names a column that is not present.
    """

    # Propagate only the private backing fields.  ``__finalize__`` copies every
    # name listed in ``_metadata`` onto derived frames by plain attribute
    # assignment; listing the public properties here would re-run validation
    # against the *derived* frame's columns and raise for results that
    # legitimately lack them (e.g. a column selection).
    _metadata = ["_group", "_timestamp_col"]

    # Class-level defaults: the getters return None instead of raising
    # AttributeError on frames whose metadata has not been set.
    # NOTE: pandas' ``__setattr__`` probes the current attribute value before
    # assigning; keeping that probe successful ensures ``frame.group = ...``
    # always reaches the property setter rather than a same-named column.
    _group = None
    _timestamp_col = None

    @property
    def _constructor(self):
        """Return the callable pandas uses to build derived frames."""
        return type(self)._internal_ctor

    @classmethod
    def _internal_ctor(cls, *args, **kwargs):
        """Build an instance for pandas-internal operations (copy, take, ...).

        The required arguments are passed as ``None`` so that no validation
        runs here; pandas copies the real values from the source frame
        afterwards through ``__finalize__`` and ``_metadata``.
        """
        kwargs["group"] = None
        kwargs["timestamp_col"] = None
        return cls(*args, **kwargs)

    def __init__(
        self,
        data,
        group,
        timestamp_col,
        index=None,
        columns=None,
        dtype=None,
        copy=True,
    ):
        super().__init__(
            data=data, index=index, columns=columns, dtype=dtype, copy=copy
        )

        # Assigned through the properties so that validation runs.
        self.group = group
        self.timestamp_col = timestamp_col

    @property
    def group(self):
        """List of group column labels, or ``None`` when unset."""
        return self._group

    @group.setter
    def group(self, g):
        # Identity check: ``g == None`` is evaluated elementwise for Index /
        # ndarray inputs and makes the ``if`` raise for multi-label groups.
        if g is None:
            return

        # Always store a fresh list so ``self.group + [...]`` works for any
        # iterable input (tuple, Index, ndarray) and the caller's object is
        # not aliased.
        group_list = [g] if isinstance(g, str) else list(g)

        if not set(group_list).issubset(self.columns):
            # ``map(str, ...)`` keeps this a ValueError for non-string labels.
            raise ValueError(
                f"Data does not contain [{', '.join(map(str, group_list))}]"
            )
        self._group = group_list

    @property
    def timestamp_col(self):
        """Timestamp column label, or ``None`` when unset."""
        return self._timestamp_col

    @timestamp_col.setter
    def timestamp_col(self, t):
        if t is None:
            return
        if t not in self.columns:
            raise ValueError(f"Data does not contain [{t}]")
        self._timestamp_col = t

นี่คือกรณีทดสอบของฉัน:

import pytest

import pandas as pd
import numpy as np

from myclass import *


@pytest.fixture(scope="module")
def sample():
    """Plain DataFrame of six single-day rows shared by the tests in this module."""
    groups = ["a", "b", "c", "a", "b", "d"]
    gains = [100, 100, 110, 110, 90, 100]
    records = [
        {"timestamp": "2020-01-01", "group": grp, "dollar_gains": gain}
        for grp, gain in zip(groups, gains)
    ]
    return pd.DataFrame(records)

@pytest.fixture(scope="module")
def sample_obj(sample):
    """``HistDollarGains`` built from ``sample`` with the standard column names."""
    wrapped = HistDollarGains(sample, group="group", timestamp_col="timestamp")
    return wrapped

def test_constructor_without_args(sample):
    """Omitting the required ``group``/``timestamp_col`` arguments raises ``TypeError``."""
    expected_error = TypeError
    with pytest.raises(expected_error):
        HistDollarGains(sample)


def test_constructor_with_string_group(sample):
    """A string ``group`` is wrapped in a list; ``timestamp_col`` is kept as given."""
    frame = HistDollarGains(sample, "group", "timestamp")
    stored_group = frame.group
    stored_timestamp = frame.timestamp_col
    assert stored_group == ["group"]
    assert stored_timestamp == "timestamp"


def test_constructor_with_list_group(sample):
    """A list ``group`` is accepted and stored label-for-label."""
    hist_dg = HistDollarGains(sample, ["group", "timestamp"], "timestamp")
    # Previously the test only checked that construction did not raise.
    assert hist_dg.group == ["group", "timestamp"]
    assert hist_dg.timestamp_col == "timestamp"

def test_constructor_with_invalid_group(sample):
    """An unknown ``group`` label raises ``ValueError`` for any valid timestamp column."""
    # Iterate over every column instead of picking one at random so the test
    # is deterministic; ``match`` pins the error to the group validator.
    for timestamp_col in sample.columns:
        with pytest.raises(ValueError, match="invalid_group"):
            HistDollarGains(sample, "invalid_group", timestamp_col)

def test_constructor_with_invalid_timestamp(sample):
    """An unknown ``timestamp_col`` raises ``ValueError`` for any valid group column."""
    # Iterate over every column instead of picking one at random so the test
    # is deterministic; ``match`` pins the error to the timestamp validator.
    for group in sample.columns:
        with pytest.raises(ValueError, match="invalid_timestamp"):
            HistDollarGains(sample, group, "invalid_timestamp")

def test_assignment_with_indexed_obj(sample_obj):
    """``set_index`` on the subclass returns the subclass with the key columns consumed."""
    indexed = sample_obj.set_index(sample_obj.group + [sample_obj.timestamp_col])
    # Previously the result was bound to an unused local and never checked.
    assert isinstance(indexed, HistDollarGains)
    assert list(indexed.columns) == ["dollar_gains"]

def test_set_index(sample_obj):
    """Indexing by the group and timestamp columns yields those index level names."""
    index_keys = sample_obj.group + [sample_obj.timestamp_col]
    indexed = sample_obj.set_index(index_keys)
    assert indexed.index.names == ['group', 'timestamp']

— cpage
แหล่งที่มา

หาก None เป็นค่าที่ไม่ถูกต้องสำหรับ property group คุณก็ควร raise ValueError ไม่ใช่หรือ

— chepner

คุณพูดถูกที่ None เป็นค่าที่ไม่ถูกต้อง ซึ่งเป็นเหตุผลที่ฉันไม่ชอบคำสั่ง if นั้น แต่การเพิ่มคำสั่ง if ตรวจ None ทำให้การทดสอบผ่าน ฉันกำลังมองหาวิธีแก้ไขที่ถูกต้องโดยไม่ต้องใช้คำสั่ง if ตรวจ None

— cpage

ควร raise ValueError ปัญหาคือต้องหาให้ได้ว่าอะไรเป็นตัวที่พยายามตั้งค่าแอตทริบิวต์ group เป็น None ตั้งแต่แรก

— chepner

@chepner ใช่แน่นอน

— cpage

บางทีแพ็กเกจ pandas-flavor อาจช่วยได้

— Mykola Zotko

เมธอด set_index() จะเรียก self.copy() ภายในเพื่อสร้างสำเนาของออบเจกต์ DataFrame ของคุณ (ดูซอร์สโค้ดที่นี่) ซึ่งภายในนั้นจะใช้เมธอดคอนสตรักเตอร์ที่คุณกำหนดเอง คือ _internal_ctor() เพื่อสร้างออบเจกต์ใหม่ (ซอร์สโค้ด) โปรดสังเกตว่า self._constructor() ก็คือ self._internal_ctor() นั่นเอง ซึ่งเป็นเมธอดภายในที่คลาสของ pandas เกือบทั้งหมดใช้ร่วมกันในการสร้างอินสแตนซ์ใหม่ระหว่างการดำเนินการต่าง ๆ เช่น การคัดลอกแบบ deep copy หรือการ slice ปัญหาของคุณมีต้นตอมาจากฟังก์ชันนี้:

class HistDollarGains(pd.DataFrame):
    ...
    @classmethod
    def _internal_ctor(cls, *args, **kwargs):
        """Build an instance with both required arguments overwritten by ``None``."""
        # Any caller-supplied group/timestamp_col keyword is replaced here, so
        # the property setters invoked from __init__ receive None.
        kwargs["group"]         = None
        kwargs["timestamp_col"] = None
        return cls(*args, **kwargs) # this is equivalent to calling
                                    # HistDollarGains(data, group=None, timestamp_col=None)

ผมคิดว่าคุณคัดลอกโค้ดนี้มาจาก issue บน GitHub บรรทัด kwargs["**"] = None สั่งคอนสตรักเตอร์อย่างชัดเจนให้ตั้งค่าทั้ง group และ timestamp_col เป็น None สุดท้าย setter/validator จึงได้รับ None เป็นค่าใหม่และเกิดข้อผิดพลาดขึ้น

ดังนั้นคุณควรกำหนดค่าที่ validator ยอมรับได้ให้กับ group และ timestamp_col

    @classmethod
    def _internal_ctor(cls, *args, **kwargs):
        """Build an instance using placeholder values the validating setters accept."""
        # An empty list is a subset of any column set, so it passes the group check.
        kwargs["group"]         = []
        # NOTE(review): the timestamp_col setter rejects labels that are not in
        # self.columns, so this hard-coded name presumably raises for derived
        # frames that have no 'timestamp' column — confirm before relying on it.
        kwargs["timestamp_col"] = 'timestamp' # or whatever name that makes your validator happy
        return cls(*args, **kwargs)

จากนั้นคุณก็สามารถลบบรรทัด if g == None: return ออกจาก validator ได้

— gdlmx
แหล่งที่มา