Skip to content

Commit e5d494e

Browse files
committed
Keep the Misc fields on a CoreLabel
1 parent a9bef7d commit e5d494e

File tree

2 files changed

+31
-1
lines changed

2 files changed

+31
-1
lines changed

itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ public class CoNLLUReaderITest {
5252
{"sps00", "rg", "da0fp0", "ncfp000", "fp"},
5353
};
5454

55+
static final String[][] EXPECTED_MISC = {
56+
{null, null, "ArgTem=arg1:tem", null, null, "ArgTem=argM:tmp", "ArgTem=arg1:tem", null, null, null, "ArgTem=argM:loc", "ArgTem=argM:adv", null, null, null, null, null, "ArgTem=arg2:atr", null, "ArgTem=arg0:agt", null},
57+
{null, null, null, null, null},
58+
};
5559

5660
static final String[][] EXPECTED_FEATS = {
5761
{
@@ -232,11 +236,12 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
232236
}
233237
}
234238

235-
// check the features and that there are no fields currently unaccounted for
239+
// check the features, the misc columns, and that there are no fields currently unaccounted for
236240
for (int i = 0; i < sentences.size(); ++i) {
237241
CoreMap sentence = sentences.get(i);
238242
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
239243
assertEquals(EXPECTED_FEATS[i].length, tokens.size());
244+
assertEquals(EXPECTED_MISC[i].length, tokens.size());
240245
for (int j = 0; j < tokens.size(); ++j) {
241246
CoreLabel token = tokens.get(j);
242247

@@ -251,6 +256,15 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
251256
assertEquals(expected, feats);
252257
}
253258

259+
String expectedMisc = EXPECTED_MISC[i][j];
260+
if (expectedMisc == null) {
261+
assertFalse(token.containsKey(CoreAnnotations.CoNLLUMisc.class));
262+
} else {
263+
expectedKeys += 1;
264+
String misc = token.get(CoreAnnotations.CoNLLUMisc.class).toString();
265+
assertEquals(expectedMisc, misc);
266+
}
267+
254268
// Some of the AnCora sentences don't have XPOS
255269
if (token.containsKey(CoreAnnotations.PartOfSpeechAnnotation.class)) {
256270
expectedKeys += 1;

src/edu/stanford/nlp/pipeline/CoNLLUReader.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,7 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
427427
String spacesBefore = miscKeyValues.get("SpacesBefore");
428428
if (spacesBefore != null) {
429429
cl.setBefore(unescapeSpacesAfter(spacesBefore));
430+
miscKeyValues.remove("SpacesBefore");
430431
}
431432

432433
// handle the MWT info and after text
@@ -474,6 +475,21 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
474475
String spaceAfter = miscToSpaceAfter(miscKeyValues);
475476
cl.setAfter(spaceAfter);
476477
}
478+
miscKeyValues.remove("SpaceAfter");
479+
miscKeyValues.remove("SpacesAfter");
480+
if (miscKeyValues.size() > 0) {
481+
// rebuild the misc column as "key=value" pairs joined by "|" from whatever keys remain, since SpaceAfter, SpacesAfter, and SpacesBefore have been removed from the map
482+
StringBuilder misc = new StringBuilder();
483+
for (Map.Entry<String, String> entry : miscKeyValues.entrySet()) {
484+
if (misc.length() > 0) {
485+
misc.append("|");
486+
}
487+
misc.append(entry.getKey());
488+
misc.append("=");
489+
misc.append(entry.getValue());
490+
}
491+
cl.set(CoreAnnotations.CoNLLUMisc.class, misc.toString());
492+
}
477493
return cl;
478494
}
479495

0 commit comments

Comments
 (0)