pimlico.datatypes.parse.candc module

class CandcOutputCorpus(base_dir, pipeline, **kwargs)

Bases: pimlico.datatypes.tar.TarredCorpus

datatype_name = 'candc_output'
data_point_type

alias of CandcOutputDocumentType

class CandcOutputCorpusWriter(base_dir, gzip=False, append=False, trust_length=False, encoding='utf-8', **kwargs)

Bases: pimlico.datatypes.tar.TarredCorpusWriter

document_to_raw_data(data)