Skip to content

Omnipy

tasks

omnipy.components.raw.tasks

Tasks for decoding, editing, concatenating, and unioning raw datasets.

FUNCTION	DESCRIPTION
`concat_all`	Concatenate all dataset values using their native addition semantics.
`decode_bytes`	Decode each binary data file to text, auto-detecting encoding when none is supplied.
`modify_all_lines`	Apply a callable to stripped lines and join the result with OS-specific newlines.
`modify_datafile_content`	Apply a callable to each full text data file.
`modify_each_line`	Apply a callable to each line and rebuild the text from returned lines.
`union_all`	Union all dataset values using their native set-like merge semantics.

concat_all

concat_all(dataset: Dataset[_SequenceModelT]) -> _SequenceModelT

Concatenate all dataset values using their native addition semantics.

Source code in src/omnipy/components/raw/tasks.py

@TaskTemplate()
def concat_all(dataset: Dataset[_SequenceModelT]) -> _SequenceModelT:
    """Fold every value of the dataset into one result using the ``+`` operator.

    Values are combined pairwise, left to right, in the order yielded by
    ``dataset.values()``. An empty dataset makes ``reduce`` raise ``TypeError``
    because no initial value is supplied.
    """
    values = dataset.values()
    return reduce(add, values)

decode_bytes

decode_bytes(data: Model[bytes], encoding: str | None = None) -> str

Decode each binary data file to text, auto-detecting encoding when none is supplied.

Source code in src/omnipy/components/raw/tasks.py

@TaskTemplate(iterate_over_data_files=True, output_dataset_cls=StrDataset)
def decode_bytes(data: Model[bytes], encoding: str | None = None) -> str:
    """Decode each binary data file to text, auto-detecting encoding when none is supplied.

    Args:
        data: Binary content of a single data file.
        encoding: Name of the codec to decode with. When ``None``, the encoding
            is guessed by feeding the content line by line to a
            ``UniversalDetector``; if the detector reports no encoding,
            ``'ascii'`` is used as a fallback.

    Returns:
        The result of ``data.decode(encoding)``.
    """

    if encoding is None:
        detector = UniversalDetector()
        # Keep the line terminators: the detector must see the exact byte stream.
        # Dropping newline bytes can misalign multi-byte encodings (e.g. UTF-16
        # without a BOM) and skew the detection.
        for line in data.splitlines(keepends=True):
            detector.feed(line)
            if detector.done:
                # The detector is confident enough; no need to read further.
                break
        detector.close()
        result = detector.result

        encoding = result['encoding']
        confidence = result['confidence']
        language = result['language']

        # TODO: Implement simple solution to log from a task/flow.
        # TODO: Implement solution to add information to the dataset metadata and apply this to
        #       decode_bytes() for storing detected encoding etc.
        print(f'Automatically detected text encoding to be "{encoding}" with confidence '
              f'"{confidence}". The language is predicted to be "{language}". '
              f'(All predictions have been made by the "chardet" library.)')

        # The detector may report no encoding at all; fall back to ASCII.
        if encoding is None:
            encoding = 'ascii'

    return data.decode(encoding)

modify_all_lines

modify_all_lines(
    data_file: Model[str], modify_all_lines_func: IsModifyAllLinesCallable, **kwargs: object
) -> str

Apply a callable to stripped lines and join the result with OS-specific newlines.

Source code in src/omnipy/components/raw/tasks.py

@TaskTemplate(iterate_over_data_files=True)
def modify_all_lines(
    data_file: Model[str],
    modify_all_lines_func: IsModifyAllLinesCallable,
    **kwargs: object,
) -> str:
    """Run a callable over the whitespace-stripped lines of a text data file.

    The text is read line by line, each line is stripped of leading and
    trailing whitespace, and the full list is handed to
    ``modify_all_lines_func`` together with ``kwargs``. Whatever lines the
    callable returns are joined with ``os.linesep``.
    """
    line_reader = StringIO(str(data_file))
    stripped = list(map(str.strip, line_reader))
    result_lines = modify_all_lines_func(stripped, **kwargs)
    separator = os.linesep
    return separator.join(result_lines)

modify_datafile_content

modify_datafile_content(
    data_file: Model[str], modify_content_func: IsModifyContentCallable, **kwargs: object
) -> str

Apply a callable to each full text data file.

Source code in src/omnipy/components/raw/tasks.py

@TaskTemplate(iterate_over_data_files=True)
def modify_datafile_content(
    data_file: Model[str],
    modify_content_func: IsModifyContentCallable,
    **kwargs: object,
) -> str:
    """Pass the whole text of a data file through a callable and return its result.

    The data file is converted with ``str()`` before being handed to
    ``modify_content_func``; ``kwargs`` are forwarded unchanged.
    """
    content = str(data_file)
    modified_content = modify_content_func(content, **kwargs)
    return modified_content

modify_each_line

modify_each_line(
    data_file: Model[str], modify_line_func: IsModifyEachLineCallable, **kwargs: object
) -> str

Apply a callable to each line and rebuild the text from returned lines.

Source code in src/omnipy/components/raw/tasks.py

@TaskTemplate(iterate_over_data_files=True)
def modify_each_line(
    data_file: Model[str],
    modify_line_func: IsModifyEachLineCallable,
    **kwargs: object,
) -> str:
    """Rewrite a text data file one line at a time through a callable.

    ``modify_line_func`` receives the zero-based line index, the line itself
    (including its trailing newline, if any) and ``kwargs``. Lines for which
    it returns ``None`` are dropped; every other return value is appended to
    the output verbatim, with no separator added.
    """
    source = StringIO(str(data_file))
    sink = StringIO()
    for line_no, raw_line in enumerate(source):
        replacement = modify_line_func(line_no, raw_line, **kwargs)
        if replacement is None:
            # The callable asked for this line to be omitted.
            continue
        sink.write(replacement)
    return sink.getvalue()

union_all

union_all(dataset: Dataset[_UniqueModelT]) -> _UniqueModelT

Union all dataset values using their native set-like merge semantics.

Source code in src/omnipy/components/raw/tasks.py

@TaskTemplate()
def union_all(dataset: Dataset[_UniqueModelT]) -> _UniqueModelT:
    """Merge every value of the dataset into one result using the ``|=`` operator.

    The first value is deep-copied and used as the accumulator, so the in-place
    union does not operate on the first value object itself. The dataset must
    contain at least one value.
    """
    members = tuple(dataset.values())
    assert len(members) > 0

    # Start from a copy so that ``ior`` is applied to a separate object.
    accumulator = deepcopy(members[0])
    remaining = members[1:]
    return reduce(ior, remaining, accumulator)