# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html
from builtins import map
# CCGBank uses a handful of POS tags that are not in the PTB tagset.
# Each key is such a CCGBank-only tag; its value is the PTB tag to use instead.
CCGBANK_TO_PTB_POS_MAP = {
    # Round brackets
    "RRB": ")",
    "LRB": "(",
    # "SO" is rewritten as "RB"
    "SO": "RB",
    # Opening and closing quote marks
    "LQU": "``",
    "RQU": "''",
}
# The POS tags recognised by the Malt pre-trained model, i.e. the PTB extended
# tagset. Malt will fail if it is given a tag that is not listed here.
PTB_POS_TAGS = [
    "IN", "DT", "NNP", "CD", "NN", "``", "''", "POS", "(", "VBN",
    "NNS", "VBP", ",", "CC", ")", "VBD", "RB", "TO", ".", "VBZ",
    "NNPS", "PRP", "PRP$", "VB", "JJ", "MD", "VBG", "RBR", ":", "WP",
    "WDT", "JJR", "PDT", "RBS", "WRB", "JJS", "$", "RP", "FW", "EX",
    "SYM", "#", "LS", "UH", "WP$", "PRT",
]
def pos_tag_to_ptb(tag):
    """
    Convert a POS tag to one from the PTB tagset.

    A tag that is already listed in ``PTB_POS_TAGS`` is returned unchanged.
    A tag that is a key of ``CCGBANK_TO_PTB_POS_MAP`` is replaced by the
    PTB tag it maps to.

    The original docstring also points readers to ``pos_pos_tags_to_ptb``
    in the documentation.

    :param tag: the POS tag to convert
    :return: the corresponding PTB tag
    :raises NonPTBTagError: if the tag is neither a PTB tag nor one of the
        known CCGBank-specific tags
    """
    # Already a PTB tag: nothing to do
    if tag in PTB_POS_TAGS:
        return tag

    # Not a known CCGBank-specific tag either, so we cannot convert it
    if tag not in CCGBANK_TO_PTB_POS_MAP:
        raise NonPTBTagError("POS tag '%s' is not a valid PTB tag" % tag)

    # Apply the CCGBank -> PTB substitution
    return CCGBANK_TO_PTB_POS_MAP[tag]
class NonPTBTagError(Exception):
    """
    Raised by ``pos_tag_to_ptb`` for a POS tag that is neither in the PTB
    tagset nor one of the CCGBank tags that can be mapped onto it.
    """