Robust way of obtaining POS tags recorded in the ERG treebanks

(I did look in the wiki.)

In the ERG treebanks, we have YY tokens and YY lattices (if I understand correctly) stored as part of the derivation. In particular, when loading derivations in pydelphin, I see fields like this:

(1, 0, 1, <0:3>, 1, "The", 0, "null", "DT" 1.0) (2, 1, 2, <4:10>, 1, "ruling", 0, "null", "NN" 1.0) (3, 2, 3, <11:18>, 1, "follows", 0, "null", "VBZ" 1.0) (4, 3, 4, <19:20>, 1, "a", 0, "null", "DT" 1.0) (5, 4, 5, <21:25>, 1, "host", 0, "null", "NN" 1.0) (6, 5, 6, <26:28>, 1, "of", 0, "null", "IN" 1.0) (7, 6, 7, <29:37>, 1, "problems", 0, "null", "NNS" 1.0) (8, 7, 8, <38:40>, 1, "at", 0, "null", "IN" 1.0) (9, 8, 9, <41:47>, 1, "Tucson", 0, "null", "NNP" 1.0) (10, 9, 10, <48:56>, 1, "Electric", 0, "null", "NNP" 1.0) (11, 10, 11, <56:57>, 1, ",", 0, "null", "." 1.0) (12, 11, 12, <58:67>, 1, "including", 0, "null", "VBG" 1.0) (13, 12, 13, <68:73>, 1, "major", 0, "null", "JJ" 1.0) (14, 13, 14, <74:79>, 1, "write", 0, "null", "VBP" 1.0) (15, 14, 15, <79:80>, 1, "-", 0, "null", ":" 1.0) (16, 15, 16, <80:85>, 1, "downs", 0, "null", "NNS" 1.0) (17, 16, 17, <85:86>, 1, ",", 0, "null", "." 1.0) (18, 17, 18, <87:88>, 1, "a", 0, "null", "DT" 1.0) (19, 18, 19, <89:91>, 1, "60", 0, "null", "CD" 1.0) (20, 19, 20, <91:92>, 1, "%", 0, "null", "SYM" 1.0) (21, 20, 21, <93:98>, 1, "slash", 0, "null", "VB" 1.0) (22, 21, 22, <99:101>, 1, "in", 0, "null", "IN" 1.0) (23, 22, 23, <102:105>, 1, "the", 0, "null", "DT" 1.0) (24, 23, 24, <106:112>, 1, "common", 0, "null", "JJ" 1.0) (25, 24, 25, <113:118>, 1, "stock", 0, "null", "NN" 1.0) (26, 25, 26, <119:127>, 1, "dividend", 0, "null", "NN" 1.0) (27, 26, 27, <128:131>, 1, "and", 0, "null", "CC" 1.0) (28, 27, 28, <132:135>, 1, "the", 0, "null", "DT" 1.0) (29, 28, 29, <136:145>, 1, "departure", 0, "null", "NN" 1.0) (30, 29, 30, <146:148>, 1, "of", 0, "null", "IN" 1.0) (31, 30, 31, <149:155>, 1, "former", 0, "null", "JJ" 1.0) (32, 31, 32, <156:164>, 1, "Chairman", 0, "null", "NNP" 1.0) (33, 32, 33, <165:170>, 1, "Einar", 0, "null", "NNP" 1.0) (34, 33, 34, <171:176>, 1, "Greve", 0, "null", "NNP" 1.0) (35, 34, 35, <177:183>, 1, "during", 0, "null", "IN" 1.0) (36, 35, 36, <184:185>, 1, "a", 0, "null", "DT" 1.0) (37, 36, 37, <186:193>, 1, "company", 0, "null", "NN" 1.0) (38, 37, 38, <194:207>, 1, "investigation", 0, "null", "NN" 1.0) (39, 38, 39, <208:210>, 1, "of", 0, "null", "IN" 1.0) (40, 39, 40, <211:214>, 1, "his", 0, "null", "PRP$" 1.0) (41, 40, 41, <215:220>, 1, "stock", 0, "null", "NN" 1.0) (42, 41, 42, <221:226>, 1, "sales", 0, "null", "NNS" 1.0) (43, 42, 43, <226:227>, 1, ".", 0, "null", "." 1.0)

If I understand correctly, the string above is a “YY input” string. These have POS tags, which I want to extract in order to use them as features for the supertagger. However, there is a tokenization issue: something like “everyone” may in fact be tokenized as “every” and “one”, etc., resulting in a different number of POS tags than tokens in the actual derivation, and it is the actual derivation tokens that I use to build feature vectors. So, regardless of what to do about the specific tag, it is necessary to be able to tell which span the tag corresponds to in the derivation, not in the YY input. I think that information is present below, in what I believe is a YY lattice for the same sentence:

(643, 8, 9, <41:47>, 1, "Tucson", 0, "null") (644, 31, 32, <156:164>, 1, "Chairman", 0, "null") (645, 32, 33, <165:170>, 1, "Einar", 0, "null") (646, 33, 34, <171:176>, 1, "Greve", 0, "null") (647, 9, 10, <48:56>, 1, "Electric", 0, "null") (684, 8, 9, <41:47>, 1, "Tucson", 0, "null") (685, 31, 32, <156:164>, 1, "Chairman", 0, "null") (686, 32, 33, <165:170>, 1, "Einar", 0, "null") (687, 33, 34, <171:176>, 1, "Greve", 0, "null") (688, 9, 10, <48:56>, 1, "Electric", 0, "null") (689, 1, 2, <4:10>, 1, "ruling", 0, "null") (690, 2, 3, <11:18>, 1, "follows", 0, "null") (691, 4, 5, <21:25>, 1, "host", 0, "null") (692, 6, 7, <29:37>, 1, "problems", 0, "null") (693, 11, 12, <58:67>, 1, "including", 0, "null") (694, 12, 13, <68:73>, 1, "major", 0, "null") (695, 13, 14, <74:79>, 1, "write", 0, "null") (696, 20, 21, <93:98>, 1, "slash", 0, "null") (697, 23, 24, <106:112>, 1, "common", 0, "null") (698, 24, 25, <113:118>, 1, "stock", 0, "null") (699, 25, 26, <119:127>, 1, "dividend", 0, "null") (700, 26, 27, <128:131>, 1, "and", 0, "null") (701, 28, 29, <136:145>, 1, "departure", 0, "null") (702, 30, 31, <149:155>, 1, "former", 0, "null") (703, 36, 37, <186:193>, 1, "company", 0, "null") (704, 37, 38, <194:207>, 1, "investigation", 0, "null") (705, 40, 41, <215:220>, 1, "stock", 0, "null") (706, 15, 16, <80:85>, 1, "downs", 0, "null") (707, 41, 42, <221:226>, 1, "sales", 0, "null") (720, 14, 15, <79:80>, 1, "-", 0, "null") (721, 19, 20, <91:92>, 1, "%", 0, "null") (722, 10, 11, <56:57>, 1, ",", 0, "null") (723, 16, 17, <85:86>, 1, ",", 0, "null") (724, 42, 43, <226:227>, 1, ".", 0, "null") (725, 14, 15, <79:80>, 1, "-", 0, "null") (726, 19, 20, <91:92>, 1, "%", 0, "null") (727, 10, 11, <56:57>, 1, ",", 0, "null") (728, 16, 17, <85:86>, 1, ",", 0, "null") (729, 42, 43, <226:227>, 1, ".", 0, "null") (730, 1, 2, <4:10>, 1, "ruling", 0, "null") (731, 1, 2, <4:10>, 1, "ruling", 0, "null") (732, 2, 3, <11:18>, 1, "follows", 0, "null") (733, 2, 3, <11:18>, 1, "follows", 0, "null") (734, 3, 4, <19:20>, 1, "a", 0, "null") (735, 3, 4, <19:20>, 1, "a", 0, "null") (736, 4, 5, <21:25>, 1, "host", 0, "null") (737, 4, 5, <21:25>, 1, "host", 0, "null") (738, 5, 6, <26:28>, 1, "of", 0, "null") (739, 5, 6, <26:28>, 1, "of", 0, "null") (740, 6, 7, <29:37>, 1, "problems", 0, "null") (741, 6, 7, <29:37>, 1, "problems", 0, "null") (742, 7, 8, <38:40>, 1, "at", 0, "null") (743, 7, 8, <38:40>, 1, "at", 0, "null") (744, 11, 12, <58:67>, 1, "including", 0, "null") (745, 11, 12, <58:67>, 1, "including", 0, "null") (746, 12, 13, <68:73>, 1, "major", 0, "null") (747, 12, 13, <68:73>, 1, "major", 0, "null") (748, 13, 14, <74:79>, 1, "write", 0, "null") (749, 13, 14, <74:79>, 1, "write", 0, "null") (750, 17, 18, <87:88>, 1, "a", 0, "null") (751, 17, 18, <87:88>, 1, "a", 0, "null") (752, 20, 21, <93:98>, 1, "slash", 0, "null") (753, 20, 21, <93:98>, 1, "slash", 0, "null") (754, 21, 22, <99:101>, 1, "in", 0, "null") (755, 21, 22, <99:101>, 1, "in", 0, "null") (756, 22, 23, <102:105>, 1, "the", 0, "null") (757, 22, 23, <102:105>, 1, "the", 0, "null") (758, 23, 24, <106:112>, 1, "common", 0, "null") (759, 23, 24, <106:112>, 1, "common", 0, "null") (760, 24, 25, <113:118>, 1, "stock", 0, "null") (761, 24, 25, <113:118>, 1, "stock", 0, "null") (762, 25, 26, <119:127>, 1, "dividend", 0, "null") (763, 25, 26, <119:127>, 1, "dividend", 0, "null") (764, 26, 27, <128:131>, 1, "and", 0, "null") (765, 26, 27, <128:131>, 1, "and", 0, "null") (766, 27, 28, <132:135>, 1, "the", 0, "null") (767, 27, 28, <132:135>, 1, "the", 0, "null") (768, 28, 29, <136:145>, 1, "departure", 0, "null") (769, 28, 29, <136:145>, 1, "departure", 0, "null") (770, 29, 30, <146:148>, 1, "of", 0, "null") (771, 29, 30, <146:148>, 1, "of", 0, "null") (772, 30, 31, <149:155>, 1, "former", 0, "null") (773, 30, 31, <149:155>, 1, "former", 0, "null") (774, 34, 35, <177:183>, 1, "during", 0, "null") (775, 34, 35, <177:183>, 1, "during", 0, "null") (776, 35, 36, <184:185>, 1, "a", 0, "null") (777, 35, 36, <184:185>, 1, "a", 0, "null") (778, 36, 37, <186:193>, 1, "company", 0, "null") (779, 36, 37, <186:193>, 1, "company", 0, "null") (780, 37, 38, <194:207>, 1, "investigation", 0, "null") (781, 37, 38, <194:207>, 1, "investigation", 0, "null") (782, 38, 39, <208:210>, 1, "of", 0, "null") (783, 38, 39, <208:210>, 1, "of", 0, "null") (784, 39, 40, <211:214>, 1, "his", 0, "null") (785, 39, 40, <211:214>, 1, "his", 0, "null") (786, 40, 41, <215:220>, 1, "stock", 0, "null") (787, 40, 41, <215:220>, 1, "stock", 0, "null") (788, 15, 16, <80:85>, 1, "downs", 0, "null") (789, 15, 16, <80:85>, 1, "downs", 0, "null") (790, 41, 42, <221:226>, 1, "sales", 0, "null") (791, 41, 42, <221:226>, 1, "sales", 0, "null") (792, 8, 9, <41:47>, 1, "tucson", 0, "null") (793, 8, 9, <41:47>, 1, "tucson", 0, "null") (794, 31, 32, <156:164>, 1, "chairman", 0, "null") (795, 31, 32, <156:164>, 1, "chairman", 0, "null") (796, 32, 33, <165:170>, 1, "einar", 0, "null") (797, 32, 33, <165:170>, 1, "einar", 0, "null") (798, 33, 34, <171:176>, 1, "greve", 0, "null") (799, 33, 34, <171:176>, 1, "greve", 0, "null") (800, 9, 10, <48:56>, 1, "electric", 0, "null") (801, 9, 10, <48:56>, 1, "electric", 0, "null") (802, 0, 1, <0:3>, 1, "the", 0, "null") (803, 0, 1, <0:3>, 1, "the", 0, "null") (804, 18, 19, <89:91>, 1, "60", 0, "null") (805, 18, 19, <89:91>, 1, "60", 0, "null")

I can use this second string, found in the treebank, to create pydelphin YYTokenLattice objects, as sketched below. I could then try to match the spans, or something like that, to determine which POS tag corresponds to which span in the input.
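For concreteness, a minimal sketch of what I mean, assuming the two strings above are stored in the variables yy_input_str and yy_lattice_str (names are mine):

    from delphin.tokens import YYTokenLattice

    # yy_input_str / yy_lattice_str: the two strings shown above (assumed names)
    yy_input = YYTokenLattice.from_string(yy_input_str)
    yy_lattice = YYTokenLattice.from_string(yy_lattice_str)

    # each YYToken has an id, a form, and a character span in token.lnk.data;
    # the YY input tokens additionally carry (tag, probability) POS pairs
    for token in yy_input.tokens:
        print(token.id, token.lnk.data, token.form, token.pos)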

Is there an already implemented robust (ideally, pydelphin) way of doing that? (Yes, I looked in the docs.)

I’m not entirely sure what you’re asking, but the answer is most likely “no”. PyDelphin’s YY token functionality is very basic: it just models the tokens with classes and provides some I/O methods.

Also, you may want to use the latest ERG if you care about tokens aligning with other, non-DELPH-IN tools. Maybe you already are.

The two token lattices you’ve shown appear to be the p-input and p-tokens fields of the parse relation. The former is just post-REPP tokenization while the latter is post token-mapping, I think (my memory is hazy about the precise stage of processing they are taken from). Here’s an example:

~/grammars/erg-trunk/tsdb/gold/mrs$ delphin select 'p-input' . | head -1
(1, 0, 1, <0:2>, 1, "It", 0, "null", "PRP" 1.0) (2, 1, 2, <3:9>, 1, "rained", 0, "null", "VBD" 1.0) (3, 2, 3, <9:10>, 1, ".", 0, "null", "." 1.0)
~/grammars/erg-trunk/tsdb/gold/mrs$ delphin select 'p-tokens' . | head -1
(49, 1, 2, <3:9>, 1, "rained", 0, "null") (52, 2, 3, <9:10>, 1, ".", 0, "null") (53, 2, 3, <9:10>, 1, ".", 0, "null") (54, 1, 2, <3:9>, 1, "rained", 0, "null") (55, 1, 2, <3:9>, 1, "rained", 0, "null") (56, 0, 1, <0:2>, 1, "it", 0, "null") (57, 0, 1, <0:2>, 1, "it", 0, "null")

You’ll see that the p-input has POS information while the p-tokens does not, presumably because the latter’s tokens are already mapped to lexical types (not shown). Another thing to note is that the p-tokens is not printed in surface order, and there are repeated tokens (maybe it’s arranged according to the parse chart?). So if you want the POS tags, you’ll need to use the p-input, or else find some way to align the p-tokens to the derivation (e.g., note that both the token ID for “it” in the p-tokens and the token ID on the “it” terminal in the derivation below are 56):

~/grammars/erg-trunk/tsdb/gold/mrs$ delphin select derivation . | head -1
(root_strict (0 sb-hd_mc_c 0.000000 0 3 (0 it 0.000000 0 1 ("it" 56 "token [ +FORM \\"it\\" +FROM \\"0\\" +TO \\"2\\" +ID *diff-list* [ LIST *cons* [ FIRST \\"0\\" REST *list* ] LAST *list* ] +TNT null_tnt [ +TAGS *null* +PRBS *null* +MAIN tnt_main [ +TAG \\"PRP\\" +PRB \\"1.0\\" ] ] +CLASS alphabetic [ +CASE capitalized+lower +INITIAL + ] +TRAIT token_trait [ +UW - +IT italics +LB bracket_null [ LIST *list* LAST *list* ] +RB bracket_null [ LIST *list* LAST *list* ] +LD bracket_null [ LIST *list* LAST *list* ] +RD bracket_null [ LIST *list* LAST *list* ] +HD token_head [ +TI \\"<0:2>\\" +LL ctype [ -CTYPE- string ] +TG string ] ] +PRED predsort +CARG \\"It\\" +TICK + +ONSET c-or-v-onset ]")) (0 hd-pct_c 0.000000 1 3 (0 v_pst_olr 0.000000 1 2 (0 rain_v1 0.000000 1 2 ("rained" 54 "token [ +FORM \\"rained\\" +FROM \\"3\\" +TO \\"9\\" +ID *diff-list* [ LIST *cons* [ FIRST \\"1\\" REST *list* ] LAST *list* ] +TNT null_tnt [ +TAGS *null* +PRBS *null* +MAIN tnt_main [ +TAG \\"VBD\\" +PRB \\"1.0\\" ] ] +CLASS alphabetic [ +CASE non_capitalized+lower +INITIAL - ] +TRAIT token_trait [ +UW - +IT italics +LB bracket_null [ LIST *list* LAST *list* ] +RB bracket_null [ LIST *list* LAST *list* ] +LD bracket_null [ LIST *list* LAST *list* ] +RD bracket_nonnull [ LIST *cons* [ FIRST n REST *list* ] LAST *list* ] +HD token_head [ +TI \\"<3:9>\\" +LL ctype [ -CTYPE- string ] +TG string ] ] +PRED predsort +CARG \\"rained\\" +TICK + +ONSET c-or-v-onset ]"))) (0 period_pct 0.000000 2 3 ("." 52 "token [ +FORM \\".\\" +FROM \\"9\\" +TO \\"10\\" +ID *diff-list* [ LIST *cons* [ FIRST \\"2\\" REST *list* ] LAST *list* ] +TNT null_tnt [ +TAGS *null* +PRBS *null* +MAIN tnt_main [ +TAG \\".\\" +PRB \\"1.0\\" ] ] +CLASS non_alphanumeric [ +INITIAL - ] +TRAIT token_trait [ +UW - +IT italics +LB bracket_null [ LIST *list* LAST *list* ] +RB bracket_null [ LIST *list* LAST *list* ] +LD bracket_null [ LIST *list* LAST *list* ] +RD bracket_nonnull [ LIST *cons* [ FIRST n REST *list* ] LAST *list* ] +HD token_head [ +TI \\"<9:10>\\" +LL ctype [ -CTYPE- string ] +TG string ] ] +PRED predsort +CARG \\".\\" +TICK + +ONSET c-or-v-onset ]")))))
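Here is a rough sketch of that ID-based alignment, assuming deriv_str and p_tokens_str hold the derivation and p-tokens strings (an illustration, not a tested recipe):

    from delphin import derivation
    from delphin.tokens import YYTokenLattice

    deriv = derivation.from_string(deriv_str)          # the UDF string above
    lattice = YYTokenLattice.from_string(p_tokens_str)
    by_id = {tok.id: tok for tok in lattice.tokens}

    for terminal in deriv.terminals():
        for ttok in terminal.tokens:  # UDFToken objects carry the token id
            lat_tok = by_id.get(ttok.id)
            span = lat_tok.lnk.data if lat_tok is not None else None
            print(terminal.form, ttok.id, span)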

Otherwise, if you just want token spans and POS tags, the p-input would be sufficient:

>>> from delphin.tokens import YYTokenLattice
>>> s = '''(1, 0, 1, <0:3>, 1, "The", 0, "null", "DT" 1.0) (2, 1, 2, <4:10>, 1, "ruling", 0, "null", "NN" 1.0) (3, 2, 3, <11:18>, 1, "follows", 0, "null", "VBZ" 1.0) (4, 3, 4, <19:20>, 1, "a", 0, "null", "DT" 1.0) (5, 4, 5, <21:25>, 1, "host", 0, "null", "NN" 1.0) (6, 5, 6, <26:28>, 1, "of", 0, "null", "IN" 1.0) (7, 6, 7, <29:37>, 1, "problems", 0, "null", "NNS" 1.0) (8, 7, 8, <38:40>, 1, "at", 0, "null", "IN" 1.0) (9, 8, 9, <41:47>, 1, "Tucson", 0, "null", "NNP" 1.0) (10, 9, 10, <48:56>, 1, "Electric", 0, "null", "NNP" 1.0) (11, 10, 11, <56:57>, 1, ",", 0, "null", "." 1.0) (12, 11, 12, <58:67>, 1, "including", 0, "null", "VBG" 1.0) (13, 12, 13, <68:73>, 1, "major", 0, "null", "JJ" 1.0) (14, 13, 14, <74:79>, 1, "write", 0, "null", "VBP" 1.0) (15, 14, 15, <79:80>, 1, "-", 0, "null", ":" 1.0) (16, 15, 16, <80:85>, 1, "downs", 0, "null", "NNS" 1.0) (17, 16, 17, <85:86>, 1, ",", 0, "null", "." 1.0) (18, 17, 18, <87:88>, 1, "a", 0, "null", "DT" 1.0) (19, 18, 19, <89:91>, 1, "60", 0, "null", "CD" 1.0) (20, 19, 20, <91:92>, 1, "%", 0, "null", "SYM" 1.0) (21, 20, 21, <93:98>, 1, "slash", 0, "null", "VB" 1.0) (22, 21, 22, <99:101>, 1, "in", 0, "null", "IN" 1.0) (23, 22, 23, <102:105>, 1, "the", 0, "null", "DT" 1.0) (24, 23, 24, <106:112>, 1, "common", 0, "null", "JJ" 1.0) (25, 24, 25, <113:118>, 1, "stock", 0, "null", "NN" 1.0) (26, 25, 26, <119:127>, 1, "dividend", 0, "null", "NN" 1.0) (27, 26, 27, <128:131>, 1, "and", 0, "null", "CC" 1.0) (28, 27, 28, <132:135>, 1, "the", 0, "null", "DT" 1.0) (29, 28, 29, <136:145>, 1, "departure", 0, "null", "NN" 1.0) (30, 29, 30, <146:148>, 1, "of", 0, "null", "IN" 1.0) (31, 30, 31, <149:155>, 1, "former", 0, "null", "JJ" 1.0) (32, 31, 32, <156:164>, 1, "Chairman", 0, "null", "NNP" 1.0) (33, 32, 33, <165:170>, 1, "Einar", 0, "null", "NNP" 1.0) (34, 33, 34, <171:176>, 1, "Greve", 0, "null", "NNP" 1.0) (35, 34, 35, <177:183>, 1, "during", 0, "null", "IN" 1.0) (36, 35, 36, <184:185>, 1, "a", 0, "null", "DT" 1.0) (37, 36, 37, <186:193>, 1, "company", 0, "null", "NN" 1.0) (38, 37, 38, <194:207>, 1, "investigation", 0, "null", "NN" 1.0) (39, 38, 39, <208:210>, 1, "of", 0, "null", "IN" 1.0) (40, 39, 40, <211:214>, 1, "his", 0, "null", "PRP$" 1.0) (41, 40, 41, <215:220>, 1, "stock", 0, "null", "NN" 1.0) (42, 41, 42, <221:226>, 1, "sales", 0, "null", "NNS" 1.0) (43, 42, 43, <226:227>, 1, ".", 0, "null", "." 1.0)'''
>>> lattice = YYTokenLattice.from_string(s)
>>> for token in lattice.tokens:
...     print(token.lnk.data,  # cfrom, cto (see notes below)
...           token.form,
...           token.pos)
... 
(0, 3) The [('DT', 1.0)]
(4, 10) ruling [('NN', 1.0)]
(11, 18) follows [('VBZ', 1.0)]
(19, 20) a [('DT', 1.0)]
(21, 25) host [('NN', 1.0)]
(26, 28) of [('IN', 1.0)]
(29, 37) problems [('NNS', 1.0)]
(38, 40) at [('IN', 1.0)]
(41, 47) Tucson [('NNP', 1.0)]
(48, 56) Electric [('NNP', 1.0)]
(56, 57) , [('.', 1.0)]
(58, 67) including [('VBG', 1.0)]
(68, 73) major [('JJ', 1.0)]
(74, 79) write [('VBP', 1.0)]
(79, 80) - [(':', 1.0)]
(80, 85) downs [('NNS', 1.0)]
(85, 86) , [('.', 1.0)]
(87, 88) a [('DT', 1.0)]
(89, 91) 60 [('CD', 1.0)]
(91, 92) % [('SYM', 1.0)]
(93, 98) slash [('VB', 1.0)]
(99, 101) in [('IN', 1.0)]
(102, 105) the [('DT', 1.0)]
(106, 112) common [('JJ', 1.0)]
(113, 118) stock [('NN', 1.0)]
(119, 127) dividend [('NN', 1.0)]
(128, 131) and [('CC', 1.0)]
(132, 135) the [('DT', 1.0)]
(136, 145) departure [('NN', 1.0)]
(146, 148) of [('IN', 1.0)]
(149, 155) former [('JJ', 1.0)]
(156, 164) Chairman [('NNP', 1.0)]
(165, 170) Einar [('NNP', 1.0)]
(171, 176) Greve [('NNP', 1.0)]
(177, 183) during [('IN', 1.0)]
(184, 185) a [('DT', 1.0)]
(186, 193) company [('NN', 1.0)]
(194, 207) investigation [('NN', 1.0)]
(208, 210) of [('IN', 1.0)]
(211, 214) his [('PRP$', 1.0)]
(215, 220) stock [('NN', 1.0)]
(221, 226) sales [('NNS', 1.0)]
(226, 227) . [('.', 1.0)]

Note that the token.lnk.data contains cfrom/cto iff:

token.lnk is not None and token.lnk.type == delphin.lnk.Lnk.CHARSPAN

and this depends on how the profile was processed.
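In code, that check might look like this (a sketch):

    from delphin.lnk import Lnk

    if token.lnk is not None and token.lnk.type == Lnk.CHARSPAN:
        cfrom, cto = token.lnk.data  # character offsets into the raw input
    else:
        cfrom = cto = None           # no character span was recorded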

Thanks, @goodmami !

I am using the latest version of the ERG.

The problem is that I cannot use p-input directly, because it does not necessarily correspond to the derivation tokens (even in the latest ERG). And p-tokens does not include POS tags (at least in the treebanks).

Example below; note the word “Everytime” and how it corresponds to two tokens, “every” and “time”:

p-input:

(1, 0, 1, <0:1>, 1, "“", 0, "null", "RB" 1.0) (2, 1, 2, <1:10>, 1, "Everytime", 0, "null", "NNP" 1.0) (3, 2, 3, <11:13>, 1, "we", 0, "null", "PRP" 1.0) (4, 3, 4, <14:20>, 1, "talked", 0, "null", "VBD" 1.0) (5, 4, 5, <21:26>, 1, "about", 0, "null", "IN" 1.0) (6, 5, 6, <27:34>, 1, "Blinder", 0, "null", "NNP" 1.0) (7, 6, 7, <35:48>, 1, "International", 0, "null", "NNP" 1.0) (8, 7, 8, <48:49>, 1, ",", 0, "null", "." 1.0) (9, 8, 9, <50:51>, 1, "{", 0, "null", "." 1.0) (10, 9, 10, <51:57>, 1, "people", 0, "null", "NNS" 1.0) (11, 10, 11, <57:58>, 1, "}", 0, "null", "." 1.0) (12, 11, 12, <59:66>, 1, "thought", 0, "null", "VBD" 1.0) (13, 12, 13, <67:69>, 1, "it", 0, "null", "PRP" 1.0) (14, 13, 14, <70:73>, 1, "was", 0, "null", "VBD" 1.0) (15, 14, 15, <74:77>, 1, "the", 0, "null", "DT" 1.0) (16, 15, 16, <78:87>, 1, "brokerage", 0, "null", "NN" 1.0) (17, 16, 17, <88:93>, 1, "house", 0, "null", "NN" 1.0) (18, 17, 18, <93:94>, 1, ".", 0, "null", "." 1.0) (19, 18, 19, <94:95>, 1, "”", 0, "null", "NN" 1.0)

p-tokens:

(307, 6, 7, <27:34>, 1, "Blinder", 0, "null") (308, 7, 8, <35:48>, 1, "International", 0, "null") (326, 1, 2, <1:10>, 1, "Every", 0, "null") (327, 6, 7, <27:34>, 1, "Blinder", 0, "null") (328, 7, 8, <35:48>, 1, "International", 0, "null") (329, 4, 5, <14:20>, 1, "talked", 0, "null") (330, 10, 11, <51:57>, 1, "people", 0, "null") (331, 12, 13, <59:66>, 1, "thought", 0, "null") (332, 14, 15, <70:73>, 1, "was", 0, "null") (333, 16, 17, <78:87>, 1, "brokerage", 0, "null") (334, 2, 3, <1:10>, 1, "time", 0, "null") (335, 17, 18, <88:93>, 1, "house", 0, "null") (345, 9, 10, <50:51>, 1, "{", 0, "null") (346, 11, 12, <57:58>, 1, "}", 0, "null") (347, 8, 9, <48:49>, 1, ",", 0, "null") (348, 18, 19, <93:94>, 1, ".", 0, "null") (349, 19, 20, <94:95>, 1, "”", 0, "null") (350, 0, 1, <0:1>, 1, "“", 0, "null") (351, 9, 10, <50:51>, 1, "{", 0, "null") (352, 11, 12, <57:58>, 1, "}", 0, "null") (353, 8, 9, <48:49>, 1, ",", 0, "null") (354, 18, 19, <93:94>, 1, ".", 0, "null") (355, 19, 20, <94:95>, 1, "”", 0, "null") (356, 0, 1, <0:1>, 1, "“", 0, "null") (357, 3, 4, <11:13>, 1, "we", 0, "null") (358, 3, 4, <11:13>, 1, "we", 0, "null") (359, 4, 5, <14:20>, 1, "talked", 0, "null") (360, 4, 5, <14:20>, 1, "talked", 0, "null") (361, 5, 6, <21:26>, 1, "about", 0, "null") (362, 5, 6, <21:26>, 1, "about", 0, "null") (363, 10, 11, <51:57>, 1, "people", 0, "null") (364, 10, 11, <51:57>, 1, "people", 0, "null") (365, 12, 13, <59:66>, 1, "thought", 0, "null") (366, 12, 13, <59:66>, 1, "thought", 0, "null") (367, 13, 14, <67:69>, 1, "it", 0, "null") (368, 13, 14, <67:69>, 1, "it", 0, "null") (369, 14, 15, <70:73>, 1, "was", 0, "null") (370, 14, 15, <70:73>, 1, "was", 0, "null") (371, 15, 16, <74:77>, 1, "the", 0, "null") (372, 15, 16, <74:77>, 1, "the", 0, "null") (373, 16, 17, <78:87>, 1, "brokerage", 0, "null") (374, 16, 17, <78:87>, 1, "brokerage", 0, "null") (375, 2, 3, <1:10>, 1, "time", 0, "null") (376, 2, 3, <1:10>, 1, "time", 0, "null") (377, 17, 18, <88:93>, 1, "house", 0, "null") (378, 17, 18, <88:93>, 1, "house", 0, "null") (379, 6, 7, <27:34>, 1, "blinder", 0, "null") (380, 6, 7, <27:34>, 1, "blinder", 0, "null") (381, 7, 8, <35:48>, 1, "international", 0, "null") (382, 7, 8, <35:48>, 1, "international", 0, "null") (383, 1, 2, <1:10>, 1, "every", 0, "null") (384, 1, 2, <1:10>, 1, "every", 0, "null")

So, the answer appears to be that one would need to write some code mapping the words (or rather, their POS tags) in the p-input to the actual tokens in p-tokens, yes? Probably using the span attribute, which seems to be robustly present in both fields? I am not sure the “id” can be used, since the IDs in the p-tokens do not seem to appear in the p-input. E.g., in order to find the correct POS tags for “every” and “time”, as opposed to “everytime”, which, while present in the p-input, is not among the derivation’s actual terminals?

“The first one” is the p-input field, right? That has the POS tags. But it can’t be mapped to the derivation directly due to tokenization issues, unless I am missing something. If by “the latter” you mean p-tokens, then I don’t need to map that to the derivation, because I already have the tokens, which I can get via the derivation.terminals() method. The problem is the lack of a direct mapping between the POS tags and the tokens…
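For example, one could match on overlapping character spans rather than exact ones; a hedged sketch, where overlapping_pos is a hypothetical helper of mine:

    def overlapping_pos(span, yy_input):
        """Collect (tag, prob) pairs from all p-input tokens whose
        character spans overlap the given (cfrom, cto) span."""
        cfrom, cto = span
        tags = []
        for tok in yy_input.tokens:
            tf, tt = tok.lnk.data
            if tf < cto and tt > cfrom:  # the character spans overlap
                tags.extend(tok.pos)
        return tags

    # e.g. both p-tokens "Every" and "time" have span <1:10>, which overlaps
    # the single p-input token "Everytime" <1:10>, so both get its NNP tag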

In the end, I wrote the following somewhat pasta-looking function, which returns a list of tuples in which each derivation terminal is accompanied by a list of its tokens, together with each token’s corresponding POS tags and their probabilities:

    # assumes: from delphin.tokens import YYTokenLattice
    def map_lattice_to_input(self, p_input, p_tokens, deriv):
        yy_lattice = YYTokenLattice.from_string(p_tokens)
        yy_input = YYTokenLattice.from_string(p_input)
        terminals_toks_postags = []
        for t in deriv.terminals():
            toks_pos_tags = []
            for ttok in t.tokens:
                span = None
                pos_probs = {}
                # find the p-tokens lattice token whose ID matches this
                # derivation token, and take its character span
                for lat_tok in yy_lattice.tokens:
                    if lat_tok.id == ttok.id:
                        span = lat_tok.lnk.data
                        break
                # collect POS tags from the p-input token(s) covering that span
                for i, in_tok in enumerate(yy_input.tokens):
                    if in_tok.lnk.data[0] == span[0]:
                        for pos, p in in_tok.pos:
                            pos_probs.setdefault(pos, []).append(float(p))
                        if in_tok.lnk.data[1] != span[1]:
                            # the spans do not end at the same offset, so keep
                            # consuming p-input tokens until the ends line up
                            cur_tok = in_tok
                            while cur_tok.lnk.data[1] != span[1]:
                                next_tok = yy_input.tokens[i + 1]
                                i += 1
                                for pos, p in next_tok.pos:
                                    pos_probs.setdefault(pos, []).append(float(p))
                                cur_tok = next_tok
                        else:
                            break
                toks_pos_tags.append((ttok, pos_probs))
            terminals_toks_postags.append((t, toks_pos_tags))
        return terminals_toks_postags

The messiest part has to do with the fact that the input tokens may have different spans than the lattice tokens… If anyone finds themselves reading the code and spots a bug, do let me know :). Otherwise, this is how I am obtaining POS tags for now.
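For reference, calling it might look like this (names are illustrative: tagger is an instance of the class defining the method, and p_input, p_tokens, and deriv come from the profile):

    result = tagger.map_lattice_to_input(p_input, p_tokens, deriv)
    for terminal, toks_pos_tags in result:
        for ttok, pos_probs in toks_pos_tags:
            # pos_probs maps each POS tag to the probabilities collected for it
            print(terminal.form, ttok.id, pos_probs)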
