Skip to content

Module omnipy.modules.pandas.serializers

Overview

View Source
from io import BytesIO

from typing import Any, IO, Type

from omnipy.data.serializer import TarFileSerializer

from . import pd

from ...data.dataset import Dataset

from .models import PandasDataset

class PandasDatasetToTarFileSerializer(TarFileSerializer):
    """Serializer that stores a ``PandasDataset`` as CSV members of a tar file.

    Only the per-data-frame encoding and decoding is implemented here. Building
    and reading the archive is delegated to ``create_tarfile_from_dataset`` and
    ``create_dataset_from_tarfile``, which are called on ``cls``.
    """

    @classmethod
    def is_dataset_directly_supported(cls, dataset: Dataset) -> bool:
        """Tell whether ``dataset`` is an instance of ``PandasDataset``."""
        supported = isinstance(dataset, PandasDataset)
        return supported

    @classmethod
    def get_dataset_cls_for_new(cls) -> Type[Dataset]:
        """Return the dataset class associated with this serializer."""
        dataset_cls = PandasDataset
        return dataset_cls

    @classmethod
    def get_output_file_suffix(cls) -> str:
        """Return the file suffix (without a leading dot) used by this serializer."""
        suffix = 'csv'
        return suffix

    @classmethod
    def serialize(cls, pandas_dataset: PandasDataset) -> bytes | memoryview:
        """Encode every data frame of ``pandas_dataset`` as CSV and archive them.

        Args:
            pandas_dataset: The dataset to serialize; must be a ``PandasDataset``.

        Returns:
            Whatever ``cls.create_tarfile_from_dataset`` returns for the dataset.

        Raises:
            AssertionError: If ``pandas_dataset`` is not a ``PandasDataset``.
        """
        assert isinstance(pandas_dataset, PandasDataset)

        def _frame_to_csv_buffer(frame: pd.DataFrame) -> memoryview:
            # Write the CSV text as UTF-8 into memory; the row index is left out.
            buffer = BytesIO()
            frame.to_csv(buffer, index=False, mode='b', encoding='utf8')
            return buffer.getbuffer()

        return cls.create_tarfile_from_dataset(
            pandas_dataset,
            data_encode_func=_frame_to_csv_buffer,
        )

    @classmethod
    def deserialize(cls, tarfile_bytes: bytes) -> PandasDataset:
        """Build a new ``PandasDataset`` from serialized tar file bytes.

        Args:
            tarfile_bytes: Archive contents, passed on unchanged to
                ``cls.create_dataset_from_tarfile``.

        Returns:
            A freshly created ``PandasDataset`` filled through ``from_data``.
        """

        def _read_csv_member(member_stream: IO[bytes]) -> pd.DataFrame:
            # Each archive member is parsed as UTF-8 encoded CSV.
            return pd.read_csv(member_stream, encoding='utf8')

        def _as_single_entry(entry_name: str, entry_value: Any) -> dict:
            # Wrap one decoded value in a one-item mapping keyed by its name.
            return {entry_name: entry_value}

        restored = PandasDataset()
        cls.create_dataset_from_tarfile(
            restored,
            tarfile_bytes,
            data_decode_func=_read_csv_member,
            dictify_object_func=_as_single_entry,
            import_method='from_data')  # noqa
        return restored

Classes

PandasDatasetToTarFileSerializer

class PandasDatasetToTarFileSerializer(
    /,
    *args,
    **kwargs
)
View Source
class PandasDatasetToTarFileSerializer(TarFileSerializer):
    """Serializer that stores a ``PandasDataset`` as CSV members of a tar file.

    Only the per-data-frame encoding and decoding is implemented here. Building
    and reading the archive is delegated to ``create_tarfile_from_dataset`` and
    ``create_dataset_from_tarfile``, which are called on ``cls``.
    """

    @classmethod
    def is_dataset_directly_supported(cls, dataset: Dataset) -> bool:
        """Tell whether ``dataset`` is an instance of ``PandasDataset``."""
        supported = isinstance(dataset, PandasDataset)
        return supported

    @classmethod
    def get_dataset_cls_for_new(cls) -> Type[Dataset]:
        """Return the dataset class associated with this serializer."""
        dataset_cls = PandasDataset
        return dataset_cls

    @classmethod
    def get_output_file_suffix(cls) -> str:
        """Return the file suffix (without a leading dot) used by this serializer."""
        suffix = 'csv'
        return suffix

    @classmethod
    def serialize(cls, pandas_dataset: PandasDataset) -> bytes | memoryview:
        """Encode every data frame of ``pandas_dataset`` as CSV and archive them.

        Args:
            pandas_dataset: The dataset to serialize; must be a ``PandasDataset``.

        Returns:
            Whatever ``cls.create_tarfile_from_dataset`` returns for the dataset.

        Raises:
            AssertionError: If ``pandas_dataset`` is not a ``PandasDataset``.
        """
        assert isinstance(pandas_dataset, PandasDataset)

        def _frame_to_csv_buffer(frame: pd.DataFrame) -> memoryview:
            # Write the CSV text as UTF-8 into memory; the row index is left out.
            buffer = BytesIO()
            frame.to_csv(buffer, index=False, mode='b', encoding='utf8')
            return buffer.getbuffer()

        return cls.create_tarfile_from_dataset(
            pandas_dataset,
            data_encode_func=_frame_to_csv_buffer,
        )

    @classmethod
    def deserialize(cls, tarfile_bytes: bytes) -> PandasDataset:
        """Build a new ``PandasDataset`` from serialized tar file bytes.

        Args:
            tarfile_bytes: Archive contents, passed on unchanged to
                ``cls.create_dataset_from_tarfile``.

        Returns:
            A freshly created ``PandasDataset`` filled through ``from_data``.
        """

        def _read_csv_member(member_stream: IO[bytes]) -> pd.DataFrame:
            # Each archive member is parsed as UTF-8 encoded CSV.
            return pd.read_csv(member_stream, encoding='utf8')

        def _as_single_entry(entry_name: str, entry_value: Any) -> dict:
            # Wrap one decoded value in a one-item mapping keyed by its name.
            return {entry_name: entry_value}

        restored = PandasDataset()
        cls.create_dataset_from_tarfile(
            restored,
            tarfile_bytes,
            data_decode_func=_read_csv_member,
            dictify_object_func=_as_single_entry,
            import_method='from_data')  # noqa
        return restored

Static methods

create_dataset_from_tarfile
def create_dataset_from_tarfile(
    dataset: omnipy.data.dataset.Dataset,
    tarfile_bytes: bytes,
    data_decode_func: Callable[[IO[bytes]], Any],
    dictify_object_func: Callable[[str, Any], dict | str],
    import_method='from_data'
)

Parameters:

Name Type Description Default
dataset Dataset
tarfile_bytes bytes
data_decode_func Callable[[IO[bytes]], Any]
dictify_object_func Callable[[str, Any], dict | str]
import_method from_data
View Source
    @classmethod

    def create_dataset_from_tarfile(cls,
                                    dataset: Dataset,
                                    tarfile_bytes: bytes,
                                    data_decode_func: Callable[[IO[bytes]], Any],
                                    dictify_object_func: Callable[[str, Any], dict | str],
                                    import_method='from_data'):
        """Fill ``dataset`` from the members of a gzip-compressed tar archive.

        Every member name must end with ``.<suffix>``, where the suffix is the
        value of ``cls.get_output_file_suffix()``. The member name with that
        ending removed is used as the object type of the member.

        Args:
            dataset: Object to populate; the method named by ``import_method``
                is looked up on it and called once per archive member.
            tarfile_bytes: Full contents of a gzip-compressed tar archive.
            data_decode_func: Called with the stream that
                ``TarFile.extractfile`` returns for a member.
            dictify_object_func: Called with the object type and the decoded
                value; its result is handed to the import method.
            import_method: Name of the method to call on ``dataset``.

        Raises:
            AssertionError: If a member name does not end with the expected
                suffix.
        """
        with tarfile.open(fileobj=BytesIO(tarfile_bytes), mode='r:gz') as tarfile_stream:
            for filename in tarfile_stream.getnames():
                expected_ending = f'.{cls.get_output_file_suffix()}'

                # Raised explicitly rather than via `assert`, so that the check
                # is not stripped when Python runs with -O. The exception type
                # is the one the assert statement used to raise.
                if not filename.endswith(expected_ending):
                    raise AssertionError(
                        f'Archive member {filename!r} does not end with {expected_ending!r}')

                # Strip exactly the ending that was checked above. This is the
                # inverse of the `<obj_type>.<suffix>` naming also when the
                # suffix itself contains a dot.
                obj_type = filename.removesuffix(expected_ending)

                # NOTE(review): extractfile() returns None for members that are
                # not regular files or links (e.g. directories); such a member
                # would reach data_decode_func as None - confirm whether
                # archives with such members need handling.
                obj_type_file = tarfile_stream.extractfile(filename)

                getattr(dataset, import_method)(
                    dictify_object_func(obj_type, data_decode_func(obj_type_file)))
create_tarfile_from_dataset
def create_tarfile_from_dataset(
    dataset: omnipy.data.dataset.Dataset,
    data_encode_func: Callable[[Any], bytes | memoryview]
)

Parameters:

Name Type Description Default
dataset Dataset
data_encode_func Callable[[Any], bytes | memoryview]
View Source
    @classmethod

    def create_tarfile_from_dataset(cls,
                                    dataset: Dataset,
                                    data_encode_func: Callable[[Any], bytes | memoryview]):
        """Pack the items of ``dataset`` into an in-memory gzip-compressed tar archive.

        Each ``(obj_type, data_obj)`` pair from ``dataset.items()`` becomes one
        archive member named ``<obj_type>.<suffix>``, where the suffix is the
        value of ``cls.get_output_file_suffix()``. The member content is the
        result of ``data_encode_func(data_obj)``.

        Args:
            dataset: Object whose ``items()`` yields the pairs to archive.
            data_encode_func: Converts one data object into its encoded bytes.

        Returns:
            The contents of the finished archive as ``bytes``.
        """
        archive_buffer = BytesIO()

        with tarfile.open(fileobj=archive_buffer, mode='w:gz') as archive:
            for obj_type, data_obj in dataset.items():
                # A BytesIO created from initial data is already positioned at
                # its start, so it can be handed to addfile() directly.
                member_stream = BytesIO(data_encode_func(data_obj))

                member_info = TarInfo(name=f'{obj_type}.{cls.get_output_file_suffix()}')
                # addfile() reads exactly `size` bytes from the stream.
                member_info.size = member_stream.getbuffer().nbytes

                archive.addfile(member_info, member_stream)

        # Read only after the archive is closed, so that all of it has been
        # written to the buffer.
        return archive_buffer.getvalue()
deserialize
def deserialize(
    tarfile_bytes: bytes
) -> omnipy.modules.pandas.models.PandasDataset

Parameters:

Name Type Description Default
tarfile_bytes bytes

Returns:

Type Description
PandasDataset
View Source
    @classmethod

    def deserialize(cls, tarfile_bytes: bytes) -> PandasDataset:
        """Build a new ``PandasDataset`` from serialized tar file bytes.

        Args:
            tarfile_bytes: Archive contents, passed on unchanged to
                ``cls.create_dataset_from_tarfile``.

        Returns:
            A freshly created ``PandasDataset`` filled through ``from_data``.
        """

        def _read_csv_member(member_stream: IO[bytes]) -> pd.DataFrame:
            # Each archive member is parsed as UTF-8 encoded CSV.
            return pd.read_csv(member_stream, encoding='utf8')

        def _as_single_entry(entry_name: str, entry_value: Any) -> dict:
            # Wrap one decoded value in a one-item mapping keyed by its name.
            return {entry_name: entry_value}

        restored = PandasDataset()
        cls.create_dataset_from_tarfile(
            restored,
            tarfile_bytes,
            data_decode_func=_read_csv_member,
            dictify_object_func=_as_single_entry,
            import_method='from_data')  # noqa
        return restored
get_dataset_cls_for_new
def get_dataset_cls_for_new(

) -> Type[omnipy.data.dataset.Dataset]

Returns:

Type Description
Type[Dataset]
View Source
    @classmethod

    def get_dataset_cls_for_new(cls) -> Type[Dataset]:
        """Return the dataset class associated with this serializer."""
        dataset_cls = PandasDataset
        return dataset_cls
get_output_file_suffix
def get_output_file_suffix(

) -> str

Returns:

Type Description
str
View Source
    @classmethod

    def get_output_file_suffix(cls) -> str:
        """Return the file suffix (without a leading dot) used by this serializer."""
        suffix = 'csv'
        return suffix
is_dataset_directly_supported
def is_dataset_directly_supported(
    dataset: omnipy.data.dataset.Dataset
) -> bool

Parameters:

Name Type Description Default
dataset Dataset

Returns:

Type Description
bool
View Source
    @classmethod

    def is_dataset_directly_supported(cls, dataset: Dataset) -> bool:
        """Tell whether ``dataset`` is an instance of ``PandasDataset``."""
        supported = isinstance(dataset, PandasDataset)
        return supported
serialize
def serialize(
    pandas_dataset: omnipy.modules.pandas.models.PandasDataset
) -> bytes | memoryview

Parameters:

Name Type Description Default
pandas_dataset PandasDataset

Returns:

Type Description
bytes | memoryview
View Source
    @classmethod

    def serialize(cls, pandas_dataset: PandasDataset) -> bytes | memoryview:
        """Encode every data frame of ``pandas_dataset`` as CSV and archive them.

        Args:
            pandas_dataset: The dataset to serialize; must be a ``PandasDataset``.

        Returns:
            Whatever ``cls.create_tarfile_from_dataset`` returns for the dataset.

        Raises:
            AssertionError: If ``pandas_dataset`` is not a ``PandasDataset``.
        """
        assert isinstance(pandas_dataset, PandasDataset)

        def _frame_to_csv_buffer(frame: pd.DataFrame) -> memoryview:
            # Write the CSV text as UTF-8 into memory; the row index is left out.
            buffer = BytesIO()
            frame.to_csv(buffer, index=False, mode='b', encoding='utf8')
            return buffer.getbuffer()

        return cls.create_tarfile_from_dataset(
            pandas_dataset,
            data_encode_func=_frame_to_csv_buffer,
        )