pimlico.datatypes.parse.candc module

class pimlico.datatypes.parse.candc.CandcOutputCorpus(base_dir, pipeline, raw_data=False)[source]

Bases: pimlico.datatypes.tar.TarredCorpus

data_point_type

alias of CandcOutputDocumentType

datatype_name = 'candc_output'

class pimlico.datatypes.parse.candc.CandcOutputCorpusWriter(base_dir, gzip=False, append=False, trust_length=False, encoding='utf-8', **kwargs)[source]

Bases: pimlico.datatypes.tar.TarredCorpusWriter

document_to_raw_data(data)