Module omnipy.modules.pandas.serializers
Overview
View Source
from io import BytesIO
from typing import Any, IO, Type
from omnipy.data.serializer import TarFileSerializer
from . import pd
from ...data.dataset import Dataset
from .models import PandasDataset
class PandasDatasetToTarFileSerializer(TarFileSerializer):
    """Serializer that converts a ``PandasDataset`` to and from tar file bytes.

    Every data frame of the dataset is written as UTF-8 encoded CSV (without
    the index column). Building and reading the archive itself is delegated
    to ``create_tarfile_from_dataset`` and ``create_dataset_from_tarfile``,
    which are called on ``cls``.
    """

    @classmethod
    def is_dataset_directly_supported(cls, dataset: Dataset) -> bool:
        """Tell whether ``dataset`` is an instance of ``PandasDataset``."""
        if isinstance(dataset, PandasDataset):
            return True
        return False

    @classmethod
    def get_dataset_cls_for_new(cls) -> Type[Dataset]:
        """Return the ``PandasDataset`` class itself (not an instance)."""
        dataset_cls = PandasDataset
        return dataset_cls

    @classmethod
    def get_output_file_suffix(cls) -> str:
        """Return the file suffix used for serialized members: ``'csv'``."""
        suffix = 'csv'
        return suffix

    @classmethod
    def serialize(cls, pandas_dataset: PandasDataset) -> bytes | memoryview:
        """Encode every data frame of ``pandas_dataset`` as CSV in a tar file.

        Args:
            pandas_dataset: The dataset to serialize; must be a
                ``PandasDataset`` (checked with an ``assert``).

        Returns:
            Whatever ``cls.create_tarfile_from_dataset`` returns for the
            dataset and the CSV encoder defined below.
        """
        assert isinstance(pandas_dataset, PandasDataset)

        def frame_to_csv_buffer(frame: pd.DataFrame) -> memoryview:
            # Write the frame as UTF-8 CSV into memory, leaving out the index.
            buffer = BytesIO()
            frame.to_csv(buffer, encoding='utf8', mode='b', index=False)
            return buffer.getbuffer()

        return cls.create_tarfile_from_dataset(
            pandas_dataset,
            data_encode_func=frame_to_csv_buffer,
        )

    @classmethod
    def deserialize(cls, tarfile_bytes: bytes) -> PandasDataset:
        """Build a new ``PandasDataset`` from serialized tar file bytes.

        Args:
            tarfile_bytes: Archive content, passed on unchanged to
                ``cls.create_dataset_from_tarfile``.

        Returns:
            A fresh ``PandasDataset`` filled through its ``from_data``
            import method.
        """

        def read_csv_frame(stream: IO[bytes]) -> pd.DataFrame:
            # Parse one archive member as UTF-8 CSV.
            return pd.read_csv(stream, encoding='utf8')

        def as_single_entry(obj_type: str, obj_val: Any) -> dict:
            # Wrap one decoded object in a one-item mapping keyed by its name.
            return {obj_type: obj_val}

        restored = PandasDataset()
        cls.create_dataset_from_tarfile(
            restored,
            tarfile_bytes,
            data_decode_func=read_csv_frame,
            dictify_object_func=as_single_entry,
            import_method='from_data')  # noqa
        return restored
Classes
PandasDatasetToTarFileSerializer
View Source
class PandasDatasetToTarFileSerializer(TarFileSerializer):
    """Serializer that converts a ``PandasDataset`` to and from tar file bytes.

    Every data frame of the dataset is written as UTF-8 encoded CSV (without
    the index column). Building and reading the archive itself is delegated
    to ``create_tarfile_from_dataset`` and ``create_dataset_from_tarfile``,
    which are called on ``cls``.
    """

    @classmethod
    def is_dataset_directly_supported(cls, dataset: Dataset) -> bool:
        """Tell whether ``dataset`` is an instance of ``PandasDataset``."""
        if isinstance(dataset, PandasDataset):
            return True
        return False

    @classmethod
    def get_dataset_cls_for_new(cls) -> Type[Dataset]:
        """Return the ``PandasDataset`` class itself (not an instance)."""
        dataset_cls = PandasDataset
        return dataset_cls

    @classmethod
    def get_output_file_suffix(cls) -> str:
        """Return the file suffix used for serialized members: ``'csv'``."""
        suffix = 'csv'
        return suffix

    @classmethod
    def serialize(cls, pandas_dataset: PandasDataset) -> bytes | memoryview:
        """Encode every data frame of ``pandas_dataset`` as CSV in a tar file.

        Args:
            pandas_dataset: The dataset to serialize; must be a
                ``PandasDataset`` (checked with an ``assert``).

        Returns:
            Whatever ``cls.create_tarfile_from_dataset`` returns for the
            dataset and the CSV encoder defined below.
        """
        assert isinstance(pandas_dataset, PandasDataset)

        def frame_to_csv_buffer(frame: pd.DataFrame) -> memoryview:
            # Write the frame as UTF-8 CSV into memory, leaving out the index.
            buffer = BytesIO()
            frame.to_csv(buffer, encoding='utf8', mode='b', index=False)
            return buffer.getbuffer()

        return cls.create_tarfile_from_dataset(
            pandas_dataset,
            data_encode_func=frame_to_csv_buffer,
        )

    @classmethod
    def deserialize(cls, tarfile_bytes: bytes) -> PandasDataset:
        """Build a new ``PandasDataset`` from serialized tar file bytes.

        Args:
            tarfile_bytes: Archive content, passed on unchanged to
                ``cls.create_dataset_from_tarfile``.

        Returns:
            A fresh ``PandasDataset`` filled through its ``from_data``
            import method.
        """

        def read_csv_frame(stream: IO[bytes]) -> pd.DataFrame:
            # Parse one archive member as UTF-8 CSV.
            return pd.read_csv(stream, encoding='utf8')

        def as_single_entry(obj_type: str, obj_val: Any) -> dict:
            # Wrap one decoded object in a one-item mapping keyed by its name.
            return {obj_type: obj_val}

        restored = PandasDataset()
        cls.create_dataset_from_tarfile(
            restored,
            tarfile_bytes,
            data_decode_func=read_csv_frame,
            dictify_object_func=as_single_entry,
            import_method='from_data')  # noqa
        return restored
Static methods
create_dataset_from_tarfile
def create_dataset_from_tarfile(
dataset: omnipy.data.dataset.Dataset,
tarfile_bytes: bytes,
data_decode_func: Callable[[IO[bytes]], Any],
dictify_object_func: Callable[[str, Any], dict | str],
import_method='from_data'
)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset | Dataset | | |
tarfile_bytes | bytes | | |
data_decode_func | Callable[[IO[bytes]], Any] | | |
dictify_object_func | Callable[[str, Any], dict \| str] | | |
import_method | | | from_data |
View Source
@classmethod
def create_dataset_from_tarfile(cls,
                                dataset: Dataset,
                                tarfile_bytes: bytes,
                                data_decode_func: Callable[[IO[bytes]], Any],
                                dictify_object_func: Callable[[str, Any], dict | str],
                                import_method: str = 'from_data'):
    """Populate ``dataset`` from the members of a gzip-compressed tar archive.

    For every member name in the archive, the member is extracted, decoded
    with ``data_decode_func``, wrapped by ``dictify_object_func`` together
    with the member name minus its last dot-separated component, and handed
    to the method of ``dataset`` named by ``import_method``.

    Args:
        dataset: Object to fill; must expose a method named ``import_method``.
        tarfile_bytes: Content of the gzip-compressed tar archive.
        data_decode_func: Turns the extracted member file object into data.
        dictify_object_func: Combines the object name and the decoded data
            into the value passed to the import method.
        import_method: Name of the ``dataset`` method that receives each
            dictified object.

    Raises:
        AssertionError: If a member name does not end with
            ``'.' + cls.get_output_file_suffix()``. It is raised explicitly
            instead of through an ``assert`` statement so that the check is
            not stripped when Python runs with ``-O``.
    """
    with tarfile.open(fileobj=BytesIO(tarfile_bytes), mode='r:gz') as tarfile_stream:
        for filename in tarfile_stream.getnames():
            obj_type_file = tarfile_stream.extractfile(filename)

            expected_suffix = f'.{cls.get_output_file_suffix()}'
            if not filename.endswith(expected_suffix):
                raise AssertionError(
                    f'Tar file member "{filename}" does not end with the '
                    f'expected suffix "{expected_suffix}"')

            # Drop only the last dot-separated component, so names that
            # themselves contain dots keep everything before the suffix.
            obj_type = filename.rsplit('.', 1)[0]

            getattr(dataset, import_method)(
                dictify_object_func(obj_type, data_decode_func(obj_type_file)))
create_tarfile_from_dataset
def create_tarfile_from_dataset(
dataset: omnipy.data.dataset.Dataset,
data_encode_func: Callable[[Any], bytes | memoryview]
)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset | Dataset | | |
data_encode_func | Callable[[Any], bytes \| memoryview] | | |
View Source
@classmethod
def create_tarfile_from_dataset(cls,
                                dataset: Dataset,
                                data_encode_func: Callable[[Any], bytes | memoryview]):
    """Pack every item of ``dataset`` into an in-memory gzip-compressed tar archive.

    Each ``(obj_type, data_obj)`` pair from ``dataset.items()`` becomes one
    archive member named ``<obj_type>.<suffix>``, where the suffix comes from
    ``cls.get_output_file_suffix()`` and the content is
    ``data_encode_func(data_obj)``.

    Args:
        dataset: Mapping-like object whose ``items()`` yields name/data pairs.
        data_encode_func: Encodes one data object as bytes or a memoryview.

    Returns:
        The complete archive as ``bytes``.
    """
    archive_buffer = BytesIO()
    with tarfile.open(fileobj=archive_buffer, mode='w:gz') as archive:
        for obj_type, data_obj in dataset.items():
            # A BytesIO created from initial bytes already starts at offset 0.
            payload_stream = BytesIO(data_encode_func(data_obj))
            member = TarInfo(name=f'{obj_type}.{cls.get_output_file_suffix()}')
            member.size = payload_stream.getbuffer().nbytes
            archive.addfile(member, payload_stream)
    # Read the buffer only after the archive is closed, so it is complete.
    return archive_buffer.getvalue()
deserialize
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tarfile_bytes |
bytes |
Returns:
Type | Description |
---|---|
PandasDataset |
View Source
@classmethod
def deserialize(cls, tarfile_bytes: bytes) -> PandasDataset:
    """Build a new ``PandasDataset`` from serialized tar file bytes.

    Args:
        tarfile_bytes: Archive content, passed on unchanged to
            ``cls.create_dataset_from_tarfile``.

    Returns:
        A fresh ``PandasDataset`` filled through its ``from_data`` import
        method.
    """

    def read_csv_frame(stream: IO[bytes]) -> pd.DataFrame:
        # Parse one archive member as UTF-8 CSV.
        return pd.read_csv(stream, encoding='utf8')

    def as_single_entry(obj_type: str, obj_val: Any) -> dict:
        # Wrap one decoded object in a one-item mapping keyed by its name.
        return {obj_type: obj_val}

    restored = PandasDataset()
    cls.create_dataset_from_tarfile(
        restored,
        tarfile_bytes,
        data_decode_func=read_csv_frame,
        dictify_object_func=as_single_entry,
        import_method='from_data')  # noqa
    return restored
get_dataset_cls_for_new
Returns:
Type | Description |
---|---|
Type[Dataset] |
View Source
@classmethod
def get_dataset_cls_for_new(cls) -> Type[Dataset]:
    """Return the ``PandasDataset`` class itself (not an instance)."""
    dataset_cls = PandasDataset
    return dataset_cls
get_output_file_suffix
Returns:
Type | Description |
---|---|
str |
View Source
@classmethod
def get_output_file_suffix(cls) -> str:
    """Return the file suffix used for serialized members: ``'csv'``."""
    suffix = 'csv'
    return suffix
is_dataset_directly_supported
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
Dataset |
Returns:
Type | Description |
---|---|
bool |
View Source
@classmethod
def is_dataset_directly_supported(cls, dataset: Dataset) -> bool:
    """Tell whether ``dataset`` is an instance of ``PandasDataset``."""
    if isinstance(dataset, PandasDataset):
        return True
    return False
serialize
Parameters:
Name | Type | Description | Default |
---|---|---|---|
pandas_dataset |
PandasDataset |
Returns:
Type | Description |
---|---|
bytes \| memoryview | |
View Source
@classmethod
def serialize(cls, pandas_dataset: PandasDataset) -> bytes | memoryview:
    """Encode every data frame of ``pandas_dataset`` as CSV in a tar file.

    Args:
        pandas_dataset: The dataset to serialize; must be a
            ``PandasDataset`` (checked with an ``assert``).

    Returns:
        Whatever ``cls.create_tarfile_from_dataset`` returns for the dataset
        and the CSV encoder defined below.
    """
    assert isinstance(pandas_dataset, PandasDataset)

    def frame_to_csv_buffer(frame: pd.DataFrame) -> memoryview:
        # Write the frame as UTF-8 CSV into memory, leaving out the index.
        buffer = BytesIO()
        frame.to_csv(buffer, encoding='utf8', mode='b', index=False)
        return buffer.getbuffer()

    return cls.create_tarfile_from_dataset(
        pandas_dataset,
        data_encode_func=frame_to_csv_buffer,
    )