@@ -24,8 +24,6 @@ public class CoNLLUReader {
24
24
* field constants
25
25
**/
26
26
// TODO: read sent_id?
27
- // TODO: reconsider the newline as the after on the last word
28
- // TODO: keep around the rest of the misc annotations
29
27
public static final int CoNLLU_IndexField = 0 ; // position 0 of a split CoNLL-U line; presumably the token ID column -- confirm against the format spec
30
28
public static final int CoNLLU_WordField = 1 ; // position 1; presumably the surface word form -- confirm
31
29
public static final int CoNLLU_LemmaField = 2 ; // position 2; presumably the lemma -- confirm
@@ -369,6 +367,24 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
369
367
return finalAnnotation ;
370
368
}
371
369
370
+ public static final String rebuildMisc (Map <String , String > miscKeyValues ) {
371
+ if (miscKeyValues .size () == 0 ) {
372
+ return null ;
373
+ }
374
+
375
+ // rebuild the misc, since we have removed the SpaceAfter, SpacesAfter, and SpacesBefore
376
+ StringBuilder misc = new StringBuilder ();
377
+ for (Map .Entry <String , String > entry : miscKeyValues .entrySet ()) {
378
+ if (misc .length () > 0 ) {
379
+ misc .append ("|" );
380
+ }
381
+ misc .append (entry .getKey ());
382
+ misc .append ("=" );
383
+ misc .append (entry .getValue ());
384
+ }
385
+ return misc .toString ();
386
+ }
387
+
372
388
/**
373
389
* Convert a single ten column CoNLLU line into a CoreLabel
374
390
*/
@@ -454,6 +470,12 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
454
470
cl .setIsMWTFirst (false );
455
471
} else {
456
472
cl .setIsMWTFirst (true );
473
+
474
+ // if we are first, look for SpacesBefore
475
+ String mwtSpacesBefore = mwtKeyValues .get ("SpacesBefore" );
476
+ if (mwtSpacesBefore != null ) {
477
+ cl .setBefore (unescapeSpacesAfter (mwtSpacesBefore ));
478
+ }
457
479
}
458
480
// SpaceAfter / SpacesAfter should only apply to the last word in an MWT
459
481
// all other words are treated as implicitly having SpaceAfter=No
@@ -467,6 +489,16 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
467
489
String spaceAfter = miscToSpaceAfter (mwtKeyValues );
468
490
cl .setAfter (spaceAfter );
469
491
}
492
+ if (cl .isMWTFirst ()) {
493
+ mwtKeyValues .remove ("SpaceAfter" );
494
+ mwtKeyValues .remove ("SpacesAfter" );
495
+ mwtKeyValues .remove ("SpacesBefore" );
496
+
497
+ String mwtMisc = rebuildMisc (mwtKeyValues );
498
+ if (mwtMisc != null ) {
499
+ cl .set (CoreAnnotations .MWTTokenMiscAnnotation .class , mwtMisc );
500
+ }
501
+ }
470
502
} else {
471
503
cl .setIsMWT (false );
472
504
cl .setIsMWTFirst (false );
@@ -476,18 +508,9 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
476
508
}
477
509
miscKeyValues .remove ("SpaceAfter" );
478
510
miscKeyValues .remove ("SpacesAfter" );
479
- if (miscKeyValues .size () > 0 ) {
480
- // rebuild the misc, since we have removed the SpaceAfter, SpacesAfter, and SpacesBefore
481
- StringBuilder misc = new StringBuilder ();
482
- for (Map .Entry <String , String > entry : miscKeyValues .entrySet ()) {
483
- if (misc .length () > 0 ) {
484
- misc .append ("|" );
485
- }
486
- misc .append (entry .getKey ());
487
- misc .append ("=" );
488
- misc .append (entry .getValue ());
489
- }
490
- cl .set (CoreAnnotations .CoNLLUMisc .class , misc .toString ());
511
+ String misc = rebuildMisc (miscKeyValues );
512
+ if (misc != null ) {
513
+ cl .set (CoreAnnotations .CoNLLUMisc .class , misc );
491
514
}
492
515
return cl ;
493
516
}
0 commit comments