@@ -326,15 +326,14 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
326
326
// build sentences
327
327
List <CoreMap > sentences = new ArrayList <>();
328
328
for (CoNLLUSentence sent : doc .sentences ) {
329
- sentences .add (convertCoNLLUSentenceToCoreMap (doc , sent ));
329
+ // pass in sentences.size() as the sentence index so the CoreLabels are built with the correct sentIndex()
330
+ // setting the index at creation time means we don't mess up the hashCodes later
331
+ sentences .add (convertCoNLLUSentenceToCoreMap (doc , sent , sentences .size ()));
330
332
}
331
333
// set sentences
332
334
finalAnnotation .set (CoreAnnotations .SentencesAnnotation .class , sentences );
333
335
// build document wide CoreLabels list
334
- // TODO: do we need to put new SentenceIndexAnnotations on each of the IndexedWords?
335
- // TODO: what about document annotation?
336
- // We should confirm that setting the SentenceIndexAnnotation like this isn't
337
- // distorting any of the SemanticGraphs
336
+ // TODO: should we set document annotation?
338
337
List <CoreLabel > tokens = new ArrayList <>();
339
338
finalAnnotation .set (CoreAnnotations .TokensAnnotation .class , tokens );
340
339
int documentIdx = 0 ;
@@ -351,15 +350,9 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
351
350
for (CoreLabel token : sentence .get (CoreAnnotations .TokensAnnotation .class )) {
352
351
token .set (CoreAnnotations .TokenBeginAnnotation .class , documentIdx );
353
352
token .set (CoreAnnotations .TokenEndAnnotation .class , documentIdx + 1 );
354
- token .set (CoreAnnotations .SentenceIndexAnnotation .class , sentenceIdx );
355
353
tokens .add (token );
356
354
documentIdx ++;
357
355
}
358
- if (sentence .containsKey (CoreAnnotations .EmptyTokensAnnotation .class )) {
359
- for (CoreLabel token : sentence .get (CoreAnnotations .EmptyTokensAnnotation .class )) {
360
- token .set (CoreAnnotations .SentenceIndexAnnotation .class , sentenceIdx );
361
- }
362
- }
363
356
sentenceIdx ++;
364
357
}
365
358
// make sure to set docText AFTER all the above processing
@@ -389,9 +382,10 @@ public static final String rebuildMisc(Map<String, String> miscKeyValues) {
389
382
/**
390
383
* Convert a single ten-column CoNLL-U line into a CoreLabel, recording sentenceIdx as its SentenceIndexAnnotation
391
384
*/
392
- public CoreLabel convertLineToCoreLabel (CoNLLUSentence sentence , String line ) {
385
+ public CoreLabel convertLineToCoreLabel (CoNLLUSentence sentence , String line , int sentenceIdx ) {
393
386
List <String > fields = Arrays .asList (line .split ("\t " ));
394
387
CoreLabel cl = new CoreLabel ();
388
+ cl .set (CoreAnnotations .SentenceIndexAnnotation .class , sentenceIdx );
395
389
396
390
String indexField = fields .get (CoNLLU_IndexField );
397
391
int sentenceTokenIndex ;
@@ -522,12 +516,12 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
522
516
/**
523
517
* Convert a list of CoNLL-U token lines into a sentence CoreMap
524
518
**/
525
- public CoreMap convertCoNLLUSentenceToCoreMap (CoNLLUDocument doc , CoNLLUSentence sentence ) {
519
+ public CoreMap convertCoNLLUSentenceToCoreMap (CoNLLUDocument doc , CoNLLUSentence sentence , int sentenceIdx ) {
526
520
List <String > lines = sentence .tokenLines ;
527
521
// create CoreLabels
528
522
List <CoreLabel > coreLabels = new ArrayList <CoreLabel >();
529
523
for (String line : lines ) {
530
- CoreLabel cl = convertLineToCoreLabel (sentence , line );
524
+ CoreLabel cl = convertLineToCoreLabel (sentence , line , sentenceIdx );
531
525
coreLabels .add (cl );
532
526
}
533
527
for (int i = 1 ; i < coreLabels .size () ; i ++) {
@@ -570,7 +564,7 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
570
564
571
565
List <CoreLabel > emptyLabels = new ArrayList <CoreLabel >();
572
566
for (String line : sentence .emptyLines ) {
573
- CoreLabel cl = convertLineToCoreLabel (sentence , line );
567
+ CoreLabel cl = convertLineToCoreLabel (sentence , line , sentenceIdx );
574
568
emptyLabels .add (cl );
575
569
}
576
570
0 commit comments