@@ -2,7 +2,7 @@ module Tokenize
2
2
3
3
export tokenize, untokenize, Tokens
4
4
5
- using ..JuliaSyntax: JuliaSyntax, Kind, @K_str
5
+ using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str
6
6
7
7
import ..JuliaSyntax: kind,
8
8
is_literal, is_error, is_contextual_keyword, is_word_operator
382
382
383
383
Returns the next character and increments the current position.
384
384
"""
385
- function readchar end
386
-
387
-
388
385
function readchar(l::Lexer)
389
386
c = readchar(l.io)
390
387
l.chars = (l.chars[2], l.chars[3], l.chars[4], c)
@@ -446,17 +443,6 @@ function emit(l::Lexer, kind::Kind, maybe_op=true)
446
443
return tok
447
444
end
448
445
449
- """
450
- emit_error(l::Lexer, err::Kind)
451
-
452
- Returns an `K"error"` token with error `err` and starts a new `RawToken`.
453
- """
454
- function emit_error(l::Lexer, err::Kind)
455
- @assert is_error(err)
456
- return emit(l, err)
457
- end
458
-
459
-
460
446
"""
461
447
next_token(l::Lexer)
462
448
@@ -551,20 +537,43 @@ function _next_token(l::Lexer, c)
551
537
elseif (k = get(_unicode_ops, c, K"error")) != K"error"
552
538
return emit(l, k)
553
539
else
554
- emit_error (l,
540
+ emit (l,
555
541
!isvalid(c) ? K"ErrorInvalidUTF8" :
556
542
is_invisible_char(c) ? K"ErrorInvisibleChar" :
557
543
K"ErrorUnknownCharacter")
558
544
end
559
545
end
560
546
547
# UAX #9: Unicode Bidirectional Algorithm
# https://unicode.org/reports/tr9/
#
# This is a very partial implementation - it only counts bidirectional
# formatting characters so that callers can check for correct nesting in
# strings and multiline comments.
#
# Takes the current state, a pair `(embedding_nesting, isolate_nesting)`, and
# the next character `c`, and returns the updated pair:
#
# * a newline `'\n'` resets both counters to zero,
# * LRE, RLE, LRO and RLO increment the embedding counter; PDF decrements it,
# * LRI, RLI and FSI increment the isolate counter; PDI decrements it,
# * any other character leaves both counters as they were.
#
# The counters are not clamped: a closing character without a matching
# opening one makes the corresponding counter negative.
function update_bidi_state((embedding_nesting, isolate_nesting), c)
    # A newline discards whatever nesting has been accumulated so far.
    c == '\n' && return (0, 0)

    if c in ('\U202A', '\U202B', '\U202D', '\U202E')   # LRE RLE LRO RLO
        return (embedding_nesting + 1, isolate_nesting)
    elseif c == '\U202C'                               # PDF
        return (embedding_nesting - 1, isolate_nesting)
    elseif c in ('\U2066', '\U2067', '\U2068')         # LRI RLI FSI
        return (embedding_nesting, isolate_nesting + 1)
    elseif c == '\U2069'                               # PDI
        return (embedding_nesting, isolate_nesting - 1)
    end

    # Not a bidirectional formatting character: state is unchanged.
    return (embedding_nesting, isolate_nesting)
end
566
+
561
567
# We're inside a string; possibly reading the string characters, or maybe in
562
568
# Julia code within an interpolation.
563
569
function lex_string_chunk(l)
564
570
state = last(l.string_states)
565
571
if state.paren_depth > 0
566
572
# Read normal Julia code inside an interpolation but track nesting of
567
573
# parentheses.
574
+ # TODO: This stateful tracking should probably, somehow, be done by the
575
+ # parser instead? Especially for recovery of unbalanced parens inside
576
+ # interpolations?
568
577
c = readchar(l)
569
578
if c == '('
570
579
l.string_states[end] = StringState(state.triplestr, state.raw, state.delim,
@@ -598,7 +607,7 @@ function lex_string_chunk(l)
598
607
# Only allow certain characters after interpolated vars
599
608
# https://github.com/JuliaLang/julia/pull/25234
600
609
readchar(l)
601
- return emit_error (l, K"ErrorInvalidInterpolationTerminator")
610
+ return emit (l, K"ErrorInvalidInterpolationTerminator")
602
611
end
603
612
if pc == EOF_CHAR
604
613
return emit(l, K"EndMarker")
@@ -637,6 +646,8 @@ function lex_string_chunk(l)
637
646
end
638
647
end
639
648
# Read a chunk of string characters
649
+ init_bidi_state = (0,0)
650
+ bidi_state = init_bidi_state
640
651
if state.raw
641
652
# Raw strings treat all characters as literals with the exception that
642
653
# the closing quotes can be escaped with an odd number of \ characters.
@@ -647,7 +658,10 @@ function lex_string_chunk(l)
647
658
elseif state.triplestr && (pc == '\n' || pc == '\r')
648
659
# triple quoted newline splitting
649
660
readchar(l)
650
- if pc == '\r' && peekchar(l) == '\n'
661
+ if pc == '\n'
662
+ bidi_state = init_bidi_state
663
+ elseif pc == '\r' && peekchar(l) == '\n'
664
+ bidi_state = init_bidi_state
651
665
readchar(l)
652
666
end
653
667
break
@@ -663,6 +677,7 @@ function lex_string_chunk(l)
663
677
readchar(l)
664
678
end
665
679
end
680
+ bidi_state = update_bidi_state(bidi_state, c)
666
681
end
667
682
else
668
683
while true
@@ -672,29 +687,39 @@ function lex_string_chunk(l)
672
687
elseif state.triplestr && (pc == '\n' || pc == '\r')
673
688
# triple quoted newline splitting
674
689
readchar(l)
675
- if pc == '\r' && peekchar(l) == '\n'
690
+ if pc == '\n'
691
+ bidi_state = init_bidi_state
692
+ elseif pc == '\r' && peekchar(l) == '\n'
676
693
readchar(l)
694
+ bidi_state = init_bidi_state
677
695
end
678
696
break
679
697
elseif pc == state.delim && string_terminates(l, state.delim, state.triplestr)
680
698
break
681
699
elseif pc == '\\'
682
700
# Escaped newline
683
- pc2 = dpeekchar (l)[2]
701
+ _, pc2, pc3 = peekchar3 (l)
684
702
if pc2 == '\r' || pc2 == '\n'
703
+ if pc2 == '\n' || pc3 == '\n'
704
+ bidi_state = init_bidi_state
705
+ end
685
706
break
686
707
end
687
708
end
688
709
c = readchar(l)
689
710
if c == '\\'
690
711
c = readchar(l)
691
712
c == EOF_CHAR && break
692
- continue
693
713
end
714
+ bidi_state = update_bidi_state(bidi_state, c)
694
715
end
695
716
end
696
- return emit(l, state.delim == '"' ? K"String" :
697
- state.delim == '`' ? K"CmdString" : K"Char")
717
+ outk = state.delim == '\'' ? K"Char" :
718
+ bidi_state != init_bidi_state ? K"ErrorBidiFormatting" :
719
+ state.delim == '"' ? K"String" :
720
+ state.delim == '`' ? K"CmdString" :
721
+ (@assert(state.delim in KSet"' \" `"); K"error")
722
+ return emit(l, outk)
698
723
end
699
724
700
725
# Lex whitespace, a whitespace char `c` has been consumed
@@ -725,13 +750,16 @@ function lex_comment(l::Lexer)
725
750
end
726
751
else
727
752
c = readchar(l) # consume the '='
753
+ init_bidi_state = (0,0)
754
+ bidi_state = init_bidi_state
728
755
skip = true # true => c was part of the prev comment marker pair
729
756
nesting = 1
730
757
while true
731
758
if c == EOF_CHAR
732
- return emit_error (l, K"ErrorEofMultiComment")
759
+ return emit (l, K"ErrorEofMultiComment")
733
760
end
734
761
nc = readchar(l)
762
+ bidi_state = update_bidi_state(bidi_state, nc)
735
763
if skip
736
764
skip = false
737
765
else
@@ -742,7 +770,9 @@ function lex_comment(l::Lexer)
742
770
nesting -= 1
743
771
skip = true
744
772
if nesting == 0
745
- return emit(l, K"Comment")
773
+ outk = bidi_state == init_bidi_state ?
774
+ K"Comment" : K"ErrorBidiFormatting"
775
+ return emit(l, outk)
746
776
end
747
777
end
748
778
end
@@ -791,12 +821,12 @@ function lex_less(l::Lexer)
791
821
elseif dpeekchar(l) == ('-', '-')
792
822
readchar(l); readchar(l)
793
823
if accept(l, '-')
794
- return emit_error (l, K"ErrorInvalidOperator")
824
+ return emit (l, K"ErrorInvalidOperator")
795
825
else
796
826
if accept(l, '>')
797
827
return emit(l, K"<-->")
798
828
elseif accept(l, '-')
799
- return emit_error (l, K"ErrorInvalidOperator")
829
+ return emit (l, K"ErrorInvalidOperator")
800
830
else
801
831
return emit(l, K"<--")
802
832
end
@@ -879,7 +909,7 @@ function lex_minus(l::Lexer)
879
909
if accept(l, '>')
880
910
return emit(l, K"-->")
881
911
else
882
- return emit_error (l, K"ErrorInvalidOperator") # "--" is an invalid operator
912
+ return emit (l, K"ErrorInvalidOperator") # "--" is an invalid operator
883
913
end
884
914
elseif !l.dotop && accept(l, '>')
885
915
return emit(l, K"->")
891
921
892
922
function lex_star(l::Lexer)
893
923
if accept(l, '*')
894
- return emit_error (l, K"Error**") # "**" is an invalid operator use ^
924
+ return emit (l, K"Error**") # "**" is an invalid operator use ^
895
925
elseif accept(l, '=')
896
926
return emit(l, K"*=")
897
927
end
@@ -952,15 +982,15 @@ function lex_digit(l::Lexer, kind)
952
982
elseif kind === K"Float"
953
983
# If we enter the function with kind == K"Float" then a '.' has been parsed.
954
984
readchar(l)
955
- return emit_error (l, K"ErrorInvalidNumericConstant")
985
+ return emit (l, K"ErrorInvalidNumericConstant")
956
986
elseif is_dottable_operator_start_char(ppc)
957
987
readchar(l)
958
- return emit_error (l, K"ErrorAmbiguousNumericConstant") # `1.+`
988
+ return emit (l, K"ErrorAmbiguousNumericConstant") # `1.+`
959
989
end
960
990
readchar(l)
961
991
962
992
kind = K"Float"
963
- accept(l, '_') && return emit_error (l, K"ErrorInvalidNumericConstant") # `1._`
993
+ accept(l, '_') && return emit (l, K"ErrorInvalidNumericConstant") # `1._`
964
994
had_fraction_digs = accept_number(l, isdigit)
965
995
pc, ppc = dpeekchar(l)
966
996
if (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
@@ -971,18 +1001,18 @@ function lex_digit(l::Lexer, kind)
971
1001
pc,ppc = dpeekchar(l)
972
1002
if pc === '.' && !is_dottable_operator_start_char(ppc)
973
1003
readchar(l)
974
- return emit_error (l, K"ErrorInvalidNumericConstant") # `1.e1.`
1004
+ return emit (l, K"ErrorInvalidNumericConstant") # `1.e1.`
975
1005
end
976
1006
else
977
- return emit_error (l, K"ErrorInvalidNumericConstant") # `1.e`
1007
+ return emit (l, K"ErrorInvalidNumericConstant") # `1.e`
978
1008
end
979
1009
elseif pc == '.' && ppc != '.' && !is_dottable_operator_start_char(ppc)
980
1010
readchar(l)
981
- return emit_error (l, K"ErrorInvalidNumericConstant") # `1.1.`
1011
+ return emit (l, K"ErrorInvalidNumericConstant") # `1.1.`
982
1012
elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
983
1013
pc == '(' || pc == '[' || pc == '{' ||
984
1014
pc == '@' || pc == '`' || pc == '"')
985
- return emit_error (l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
1015
+ return emit (l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
986
1016
end
987
1017
elseif (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
988
1018
kind = pc == 'f' ? K"Float32" : K"Float"
@@ -992,10 +1022,10 @@ function lex_digit(l::Lexer, kind)
992
1022
pc,ppc = dpeekchar(l)
993
1023
if pc === '.' && !is_dottable_operator_start_char(ppc)
994
1024
accept(l, '.')
995
- return emit_error (l, K"ErrorInvalidNumericConstant") # `1e1.`
1025
+ return emit (l, K"ErrorInvalidNumericConstant") # `1e1.`
996
1026
end
997
1027
else
998
- return emit_error (l, K"ErrorInvalidNumericConstant") # `1e+`
1028
+ return emit (l, K"ErrorInvalidNumericConstant") # `1e+`
999
1029
end
1000
1030
elseif position(l) - startpos(l) == 1 && l.chars[1] == '0'
1001
1031
kind == K"Integer"
@@ -1015,10 +1045,10 @@ function lex_digit(l::Lexer, kind)
1015
1045
kind = K"Float"
1016
1046
accept(l, "+-−")
1017
1047
if !accept_number(l, isdigit) || !had_digits
1018
- return emit_error (l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
1048
+ return emit (l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
1019
1049
end
1020
1050
elseif isfloat
1021
- return emit_error (l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
1051
+ return emit (l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
1022
1052
end
1023
1053
is_bin_oct_hex_int = !isfloat
1024
1054
elseif pc == 'b'
@@ -1038,7 +1068,7 @@ function lex_digit(l::Lexer, kind)
1038
1068
accept_batch(l, c->isdigit(c) || is_identifier_start_char(c))
1039
1069
# `0x` `0xg` `0x_` `0x-`
1040
1070
# `0b123` `0o78p` `0xenomorph` `0xaα`
1041
- return emit_error (l, K"ErrorInvalidNumericConstant")
1071
+ return emit (l, K"ErrorInvalidNumericConstant")
1042
1072
end
1043
1073
end
1044
1074
end
@@ -1132,7 +1162,7 @@ function lex_dot(l::Lexer)
1132
1162
else
1133
1163
if is_dottable_operator_start_char(peekchar(l))
1134
1164
readchar(l)
1135
- return emit_error (l, K"ErrorInvalidOperator")
1165
+ return emit (l, K"ErrorInvalidOperator")
1136
1166
else
1137
1167
return emit(l, K"..")
1138
1168
end
0 commit comments