# Source code for text2array.batches

# Copyright 2019 Kemal Kurniawan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import UserList
from functools import reduce
from typing import Dict, List, Mapping, MutableSequence, Optional, Sequence, Union, cast

import numpy as np  # type: ignore

from .samples import FieldName, FieldValue, Sample


class Batch(UserList, MutableSequence[Sample]):
    """A class to represent a single batch.

    A batch is a mutable, list-like collection of samples that can be converted
    into padded arrays with `to_array`.

    Args:
        samples (~typing.Sequence[Sample]): Sequence of samples this batch should
            contain. If omitted, the batch starts out empty.
    """

    def __init__(self, samples: Optional[Sequence[Sample]] = None) -> None:
        # constructor required; see
        # https://docs.python.org/3.6/library/collections.html#collections.UserList
        if samples is None:
            samples = []
        super().__init__(samples)

    def to_array(
        self,
        pad_with: Union[int, float, bool, Mapping[FieldName, Union[int, float, bool]]] = 0,
    ) -> Dict[FieldName, np.ndarray]:
        """Convert the batch into `~numpy.ndarray`.

        The field names are taken from the first sample in the batch; every
        other sample must provide those fields as well.

        Args:
            pad_with: Pad sequential field values with this value. Can also be a
                mapping from field names to padding value for that field. Fields
                whose name is not in the mapping will be padded with zeros.

        Returns:
            A mapping from field names to arrays whose first dimension corresponds
            to the batch size as returned by `len`. An empty batch yields an empty
            mapping.

        Raises:
            KeyError: If some samples lack a field that the first sample has.
            ValueError: If the values of a field have inconsistent nesting depth.
        """
        if not self:
            return {}

        field_names = self[0].keys()
        # Decide on "is it a mapping?" rather than "is it an int?": a float pad
        # value is not an int and would otherwise be mistaken for a mapping and
        # fail on ``.get``.
        if isinstance(pad_with, Mapping):
            pad_dict = cast(Mapping[FieldName, Union[int, float, bool]], pad_with)
        else:
            pad_dict = {name: pad_with for name in field_names}

        arr = {}
        for name in field_names:
            values = self._get_values(name)

            # Get max length for all depths, 1st elem is batch size
            try:
                maxlens = self._get_maxlens(values)
            except self._InconsistentDepthError:
                # The private error is only an internal signal; hide it from the
                # traceback shown to the caller.
                raise ValueError(f"field '{name}' has inconsistent nesting depth") from None

            # Get padding for all depths
            paddings = self._get_paddings(maxlens, pad_dict.get(name, 0))
            # Pad the values
            data = self._pad(values, maxlens, paddings, 0)

            arr[name] = np.array(data)

        return arr

    def _get_values(self, name: str) -> Sequence[FieldValue]:
        """Collect the value of field ``name`` from every sample, in batch order.

        Raises:
            KeyError: If at least one sample does not have the field.
        """
        try:
            return [s[name] for s in self]
        except KeyError as err:
            raise KeyError(f"some samples have no field '{name}'") from err

    @classmethod
    def _get_maxlens(cls, values: Sequence[FieldValue]) -> List[int]:
        """Compute the maximum length found at every nesting depth of ``values``.

        The first element of the result is ``len(values)`` itself; each following
        element is the largest length seen one nesting level deeper. Whether the
        elements are leaves is decided by looking at ``values[0]`` only: strings
        and non-sequences are leaves.

        Raises:
            _InconsistentDepthError: If the elements are not all nested equally deep.
        """
        # NOTE: empty (nested) sequences are not supported; they trip this assertion.
        assert values

        # Base case
        if isinstance(values[0], str) or not isinstance(values[0], Sequence):
            return [len(values)]

        # Recursive case
        maxlenss = [cls._get_maxlens(x) for x in values]  # type: ignore
        expected_depth = len(maxlenss[0])
        if any(len(x) != expected_depth for x in maxlenss):
            raise cls._InconsistentDepthError

        # Element-wise maximum over the per-element results.
        deeper = reduce(lambda ml1, ml2: [max(l1, l2) for l1, l2 in zip(ml1, ml2)], maxlenss)
        return [len(values), *deeper]

    @classmethod
    def _get_paddings(
        cls, maxlens: List[int], with_: Union[int, float, bool]
    ) -> List[Union[int, List[int]]]:
        """Build, for every depth, the filler used for one missing element there.

        The last entry is the scalar ``with_``; each earlier entry is a list that
        repeats the following entry as many times as the maximum length of the
        next depth. The result has the same length as ``maxlens``.
        """
        res: list = [with_]
        for maxlen in reversed(maxlens[1:]):
            res.append([res[-1] for _ in range(maxlen)])
        res.reverse()
        return res

    @classmethod
    def _pad(
        cls,
        values: Sequence[FieldValue],
        maxlens: List[int],
        paddings: List[Union[int, List[int]]],
        depth: int,
    ) -> Sequence[FieldValue]:
        """Return a copy of ``values`` padded to ``maxlens[depth]`` at every level.

        Args:
            values: Possibly nested sequence to pad; must not be empty.
            maxlens: Target length for each nesting depth.
            paddings: Filler element for each nesting depth, as produced by
                `_get_paddings`.
            depth: Nesting depth of ``values`` itself (0 for the batch level).
        """
        assert values
        assert len(maxlens) == len(paddings)
        assert depth < len(maxlens)

        # Base case
        if isinstance(values[0], str) or not isinstance(values[0], Sequence):
            values_ = list(values)
        # Recursive case
        else:
            values_ = [
                cls._pad(x, maxlens, paddings, depth + 1) for x in values  # type: ignore
            ]

        # Fill up to the target length of this depth (no-op when already long enough).
        n_missing = maxlens[depth] - len(values)
        values_.extend(paddings[depth] for _ in range(n_missing))
        return values_

    class _InconsistentDepthError(Exception):
        """Internal signal that the elements of a field are not nested equally deep."""