Skip to content

Commit c7b15fd

Browse files
committed
Save the SpacesBefore on an MWT. Save the rest of the MWT misc annotations on the first word of the MWT. Test both operations
1 parent 0942831 commit c7b15fd

File tree

3 files changed

+45
-15
lines changed

3 files changed

+45
-15
lines changed

data/edu/stanford/nlp/pipeline/en-example-misc-spaceafter.conllu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# sent_id = reviews-231203-0002
22
# text = I'm a regular at the HH.
3-
1-2 I'm _ _ _ _ _ _ _ SpacesAfter=\s\s
3+
1-2 I'm _ _ _ _ _ _ _ Foo=Bar|SpacesBefore=\s\s|SpacesAfter=\s\s
44
1 I I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 4 nsubj 4:nsubj _
55
2 'm be AUX VBP Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin 4 cop 4:cop _
66
3 a a DET DT Definite=Ind|PronType=Art 4 det 4:det _

itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,13 @@ public void testReadingMiscSpaceAfter() throws ClassNotFoundException, IOExcepti
523523
// check the SpaceAfter of the second word, which is where the MWT SpaceAfter should go
524524
assertEquals(" ", tokens.get(1).after());
525525

526+
// the MWT SpacesBefore should go on the first word
527+
assertEquals(" ", tokens.get(0).before());
528+
assertEquals("", tokens.get(1).before());
529+
530+
assertEquals("Foo=Bar", tokens.get(0).get(CoreAnnotations.MWTTokenMiscAnnotation.class));
531+
assertEquals(null, tokens.get(1).get(CoreAnnotations.MWTTokenMiscAnnotation.class));
532+
526533
assertTrue(sentence.containsKey(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class));
527534
assertTrue(sentence.containsKey(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class));
528535
}

src/edu/stanford/nlp/pipeline/CoNLLUReader.java

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@ public class CoNLLUReader {
2424
* field constants
2525
**/
2626
// TODO: read sent_id?
27-
// TODO: reconsider the newline as the after on the last word
28-
// TODO: keep around the rest of the misc annotations
2927
public static final int CoNLLU_IndexField = 0;
3028
public static final int CoNLLU_WordField = 1;
3129
public static final int CoNLLU_LemmaField = 2;
@@ -369,6 +367,24 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
369367
return finalAnnotation;
370368
}
371369

370+
public static final String rebuildMisc(Map<String, String> miscKeyValues) {
371+
if (miscKeyValues.size() == 0) {
372+
return null;
373+
}
374+
375+
// rebuild the misc, since we have removed the SpaceAfter, SpacesAfter, and SpacesBefore
376+
StringBuilder misc = new StringBuilder();
377+
for (Map.Entry<String, String> entry : miscKeyValues.entrySet()) {
378+
if (misc.length() > 0) {
379+
misc.append("|");
380+
}
381+
misc.append(entry.getKey());
382+
misc.append("=");
383+
misc.append(entry.getValue());
384+
}
385+
return misc.toString();
386+
}
387+
372388
/**
373389
* Convert a single ten column CoNLLU line into a CoreLabel
374390
*/
@@ -454,6 +470,12 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
454470
cl.setIsMWTFirst(false);
455471
} else {
456472
cl.setIsMWTFirst(true);
473+
474+
// if we are first, look for SpacesBefore
475+
String mwtSpacesBefore = mwtKeyValues.get("SpacesBefore");
476+
if (mwtSpacesBefore != null) {
477+
cl.setBefore(unescapeSpacesAfter(mwtSpacesBefore));
478+
}
457479
}
458480
// SpaceAfter / SpacesAfter should only apply to the last word in an MWT
459481
// all other words are treated as implicitly having SpaceAfter=No
@@ -467,6 +489,16 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
467489
String spaceAfter = miscToSpaceAfter(mwtKeyValues);
468490
cl.setAfter(spaceAfter);
469491
}
492+
if (cl.isMWTFirst()) {
493+
mwtKeyValues.remove("SpaceAfter");
494+
mwtKeyValues.remove("SpacesAfter");
495+
mwtKeyValues.remove("SpacesBefore");
496+
497+
String mwtMisc = rebuildMisc(mwtKeyValues);
498+
if (mwtMisc != null) {
499+
cl.set(CoreAnnotations.MWTTokenMiscAnnotation.class, mwtMisc);
500+
}
501+
}
470502
} else {
471503
cl.setIsMWT(false);
472504
cl.setIsMWTFirst(false);
@@ -476,18 +508,9 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
476508
}
477509
miscKeyValues.remove("SpaceAfter");
478510
miscKeyValues.remove("SpacesAfter");
479-
if (miscKeyValues.size() > 0) {
480-
// rebuild the misc, since we have removed the SpaceAfter, SpacesAfter, and SpacesBefore
481-
StringBuilder misc = new StringBuilder();
482-
for (Map.Entry<String, String> entry : miscKeyValues.entrySet()) {
483-
if (misc.length() > 0) {
484-
misc.append("|");
485-
}
486-
misc.append(entry.getKey());
487-
misc.append("=");
488-
misc.append(entry.getValue());
489-
}
490-
cl.set(CoreAnnotations.CoNLLUMisc.class, misc.toString());
511+
String misc = rebuildMisc(miscKeyValues);
512+
if (misc != null) {
513+
cl.set(CoreAnnotations.CoNLLUMisc.class, misc);
491514
}
492515
return cl;
493516
}

0 commit comments

Comments
 (0)