Skip to content

Commit 4b427da

Browse files
committed
Disallow unbalanced bidirectional unicode
Disallow unbalanced Unicode bidirectional formatting directives within strings and comments, to mitigate the "trojan source" vulnerability (https://www.trojansource.codes). See also JuliaLang/julia#42918.
1 parent 6e3782f commit 4b427da

File tree

8 files changed

+247
-49
lines changed

8 files changed

+247
-49
lines changed

src/kinds.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ const _kind_names =
2626
"ErrorInvalidUTF8"
2727
"ErrorInvisibleChar"
2828
"ErrorUnknownCharacter"
29+
"ErrorBidiFormatting"
2930
# Generic error
3031
"error"
3132
"END_ERRORS"
@@ -1049,6 +1050,7 @@ const _nonunique_kind_names = Set([
10491050
K"ErrorInvalidUTF8"
10501051
K"ErrorInvisibleChar"
10511052
K"ErrorUnknownCharacter"
1053+
K"ErrorBidiFormatting"
10521054
K"ErrorInvalidOperator"
10531055

10541056
K"Integer"
@@ -1098,6 +1100,7 @@ const _token_error_descriptions = Dict{Kind, String}(
10981100
K"ErrorInvalidUTF8"=>"invalid UTF-8 character",
10991101
K"ErrorInvisibleChar"=>"invisible character",
11001102
K"ErrorUnknownCharacter"=>"unknown unicode character",
1103+
K"ErrorBidiFormatting"=>"unbalanced bidirectional unicode formatting",
11011104
K"ErrorInvalidOperator" => "invalid operator",
11021105
K"Error**" => "use `x^y` instead of `x**y` for exponentiation, and `x...` instead of `**x` for splatting",
11031106
K"error" => "unknown error token",

src/parse_stream.jl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,8 @@ function validate_tokens(stream::ParseStream)
949949
# Emit messages for non-generic token errors
950950
msg = if k in KSet"ErrorInvalidUTF8 ErrorInvisibleChar ErrorUnknownCharacter"
951951
"$(_token_error_descriptions[k]) $(repr(text[fbyte]))"
952+
elseif k == K"ErrorBidiFormatting"
953+
"$(_token_error_descriptions[k]) $(repr(text[fbyte:prevind(text, nbyte)]))"
952954
else
953955
_token_error_descriptions[k]
954956
end

src/parser.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3282,6 +3282,9 @@ function parse_string(ps::ParseState, raw::Bool)
32823282
first_chunk = false
32833283
n_valid_chunks += 1
32843284
end
3285+
elseif k == K"ErrorInvalidInterpolationTerminator" || k == K"ErrorBidiFormatting"
3286+
# Treat these errors as string chunks
3287+
bump(ps)
32853288
else
32863289
break
32873290
end
@@ -3381,6 +3384,8 @@ function parse_atom(ps::ParseState, check_identifiers=true)
33813384
else
33823385
if k == K"Char"
33833386
bump(ps)
3387+
elseif is_error(k)
3388+
bump(ps)
33843389
else
33853390
# FIXME: This case is actually a tokenization error.
33863391
# Make a best-effort attempt to workaround this for now by

src/tokenize.jl

Lines changed: 72 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ module Tokenize
22

33
export tokenize, untokenize, Tokens
44

5-
using ..JuliaSyntax: JuliaSyntax, Kind, @K_str
5+
using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str
66

77
import ..JuliaSyntax: kind,
88
is_literal, is_error, is_contextual_keyword, is_word_operator
@@ -382,9 +382,6 @@ end
382382

383383
Returns the next character and increments the current position.
384384
"""
385-
function readchar end
386-
387-
388385
function readchar(l::Lexer)
389386
c = readchar(l.io)
390387
l.chars = (l.chars[2], l.chars[3], l.chars[4], c)
@@ -446,17 +443,6 @@ function emit(l::Lexer, kind::Kind, maybe_op=true)
446443
return tok
447444
end
448445

449-
"""
450-
emit_error(l::Lexer, err::Kind)
451-
452-
Returns an `K"error"` token with error `err` and starts a new `RawToken`.
453-
"""
454-
function emit_error(l::Lexer, err::Kind)
455-
@assert is_error(err)
456-
return emit(l, err)
457-
end
458-
459-
460446
"""
461447
next_token(l::Lexer)
462448

@@ -551,20 +537,43 @@ function _next_token(l::Lexer, c)
551537
elseif (k = get(_unicode_ops, c, K"error")) != K"error"
552538
return emit(l, k)
553539
else
554-
emit_error(l,
540+
emit(l,
555541
!isvalid(c) ? K"ErrorInvalidUTF8" :
556542
is_invisible_char(c) ? K"ErrorInvisibleChar" :
557543
K"ErrorUnknownCharacter")
558544
end
559545
end
560546

547+
# UAX #9: Unicode Bidirectional Algorithm
548+
# https://unicode.org/reports/tr9/
549+
# Very partial implementation - just enough to check correct nesting in strings
550+
# and multiline comments.
551+
function update_bidi_state((embedding_nesting, isolate_nesting), c)
552+
if c == '\n'
553+
embedding_nesting = 0
554+
isolate_nesting = 0
555+
elseif c == '\U202A' || c == '\U202B' || c == '\U202D' || c == '\U202E' # LRE RLE LRO RLO
556+
embedding_nesting += 1
557+
elseif c == '\U202C' # PDF
558+
embedding_nesting -= 1
559+
elseif c == '\U2066' || c == '\U2067' || c == '\U2068' # LRI RLI FSI
560+
isolate_nesting += 1
561+
elseif c == '\U2069' # PDI
562+
isolate_nesting -= 1
563+
end
564+
return (embedding_nesting, isolate_nesting)
565+
end
566+
561567
# We're inside a string; possibly reading the string characters, or maybe in
562568
# Julia code within an interpolation.
563569
function lex_string_chunk(l)
564570
state = last(l.string_states)
565571
if state.paren_depth > 0
566572
# Read normal Julia code inside an interpolation but track nesting of
567573
# parentheses.
574+
# TODO: This stateful tracking should probably, somehow, be done by the
575+
# parser instead? Especially for recovery of unbalanced parens inside
576+
# interpolations?
568577
c = readchar(l)
569578
if c == '('
570579
l.string_states[end] = StringState(state.triplestr, state.raw, state.delim,
@@ -598,7 +607,7 @@ function lex_string_chunk(l)
598607
# Only allow certain characters after interpolated vars
599608
# https://github.com/JuliaLang/julia/pull/25234
600609
readchar(l)
601-
return emit_error(l, K"ErrorInvalidInterpolationTerminator")
610+
return emit(l, K"ErrorInvalidInterpolationTerminator")
602611
end
603612
if pc == EOF_CHAR
604613
return emit(l, K"EndMarker")
@@ -637,6 +646,8 @@ function lex_string_chunk(l)
637646
end
638647
end
639648
# Read a chunk of string characters
649+
init_bidi_state = (0,0)
650+
bidi_state = init_bidi_state
640651
if state.raw
641652
# Raw strings treat all characters as literals with the exception that
642653
# the closing quotes can be escaped with an odd number of \ characters.
@@ -647,7 +658,10 @@ function lex_string_chunk(l)
647658
elseif state.triplestr && (pc == '\n' || pc == '\r')
648659
# triple quoted newline splitting
649660
readchar(l)
650-
if pc == '\r' && peekchar(l) == '\n'
661+
if pc == '\n'
662+
bidi_state = init_bidi_state
663+
elseif pc == '\r' && peekchar(l) == '\n'
664+
bidi_state = init_bidi_state
651665
readchar(l)
652666
end
653667
break
@@ -663,6 +677,7 @@ function lex_string_chunk(l)
663677
readchar(l)
664678
end
665679
end
680+
bidi_state = update_bidi_state(bidi_state, c)
666681
end
667682
else
668683
while true
@@ -672,29 +687,39 @@ function lex_string_chunk(l)
672687
elseif state.triplestr && (pc == '\n' || pc == '\r')
673688
# triple quoted newline splitting
674689
readchar(l)
675-
if pc == '\r' && peekchar(l) == '\n'
690+
if pc == '\n'
691+
bidi_state = init_bidi_state
692+
elseif pc == '\r' && peekchar(l) == '\n'
676693
readchar(l)
694+
bidi_state = init_bidi_state
677695
end
678696
break
679697
elseif pc == state.delim && string_terminates(l, state.delim, state.triplestr)
680698
break
681699
elseif pc == '\\'
682700
# Escaped newline
683-
pc2 = dpeekchar(l)[2]
701+
_, pc2, pc3 = peekchar3(l)
684702
if pc2 == '\r' || pc2 == '\n'
703+
if pc2 == '\n' || pc3 == '\n'
704+
bidi_state = init_bidi_state
705+
end
685706
break
686707
end
687708
end
688709
c = readchar(l)
689710
if c == '\\'
690711
c = readchar(l)
691712
c == EOF_CHAR && break
692-
continue
693713
end
714+
bidi_state = update_bidi_state(bidi_state, c)
694715
end
695716
end
696-
return emit(l, state.delim == '"' ? K"String" :
697-
state.delim == '`' ? K"CmdString" : K"Char")
717+
outk = state.delim == '\'' ? K"Char" :
718+
bidi_state != init_bidi_state ? K"ErrorBidiFormatting" :
719+
state.delim == '"' ? K"String" :
720+
state.delim == '`' ? K"CmdString" :
721+
(@assert(state.delim in KSet"' \" `"); K"error")
722+
return emit(l, outk)
698723
end
699724

700725
# Lex whitespace, a whitespace char `c` has been consumed
@@ -725,13 +750,16 @@ function lex_comment(l::Lexer)
725750
end
726751
else
727752
c = readchar(l) # consume the '='
753+
init_bidi_state = (0,0)
754+
bidi_state = init_bidi_state
728755
skip = true # true => c was part of the prev comment marker pair
729756
nesting = 1
730757
while true
731758
if c == EOF_CHAR
732-
return emit_error(l, K"ErrorEofMultiComment")
759+
return emit(l, K"ErrorEofMultiComment")
733760
end
734761
nc = readchar(l)
762+
bidi_state = update_bidi_state(bidi_state, nc)
735763
if skip
736764
skip = false
737765
else
@@ -742,7 +770,9 @@ function lex_comment(l::Lexer)
742770
nesting -= 1
743771
skip = true
744772
if nesting == 0
745-
return emit(l, K"Comment")
773+
outk = bidi_state == init_bidi_state ?
774+
K"Comment" : K"ErrorBidiFormatting"
775+
return emit(l, outk)
746776
end
747777
end
748778
end
@@ -791,12 +821,12 @@ function lex_less(l::Lexer)
791821
elseif dpeekchar(l) == ('-', '-')
792822
readchar(l); readchar(l)
793823
if accept(l, '-')
794-
return emit_error(l, K"ErrorInvalidOperator")
824+
return emit(l, K"ErrorInvalidOperator")
795825
else
796826
if accept(l, '>')
797827
return emit(l, K"<-->")
798828
elseif accept(l, '-')
799-
return emit_error(l, K"ErrorInvalidOperator")
829+
return emit(l, K"ErrorInvalidOperator")
800830
else
801831
return emit(l, K"<--")
802832
end
@@ -879,7 +909,7 @@ function lex_minus(l::Lexer)
879909
if accept(l, '>')
880910
return emit(l, K"-->")
881911
else
882-
return emit_error(l, K"ErrorInvalidOperator") # "--" is an invalid operator
912+
return emit(l, K"ErrorInvalidOperator") # "--" is an invalid operator
883913
end
884914
elseif !l.dotop && accept(l, '>')
885915
return emit(l, K"->")
@@ -891,7 +921,7 @@ end
891921

892922
function lex_star(l::Lexer)
893923
if accept(l, '*')
894-
return emit_error(l, K"Error**") # "**" is an invalid operator use ^
924+
return emit(l, K"Error**") # "**" is an invalid operator use ^
895925
elseif accept(l, '=')
896926
return emit(l, K"*=")
897927
end
@@ -952,15 +982,15 @@ function lex_digit(l::Lexer, kind)
952982
elseif kind === K"Float"
953983
# If we enter the function with kind == K"Float" then a '.' has been parsed.
954984
readchar(l)
955-
return emit_error(l, K"ErrorInvalidNumericConstant")
985+
return emit(l, K"ErrorInvalidNumericConstant")
956986
elseif is_dottable_operator_start_char(ppc)
957987
readchar(l)
958-
return emit_error(l, K"ErrorAmbiguousNumericConstant") # `1.+`
988+
return emit(l, K"ErrorAmbiguousNumericConstant") # `1.+`
959989
end
960990
readchar(l)
961991

962992
kind = K"Float"
963-
accept(l, '_') && return emit_error(l, K"ErrorInvalidNumericConstant") # `1._`
993+
accept(l, '_') && return emit(l, K"ErrorInvalidNumericConstant") # `1._`
964994
had_fraction_digs = accept_number(l, isdigit)
965995
pc, ppc = dpeekchar(l)
966996
if (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
@@ -971,18 +1001,18 @@ function lex_digit(l::Lexer, kind)
9711001
pc,ppc = dpeekchar(l)
9721002
if pc === '.' && !is_dottable_operator_start_char(ppc)
9731003
readchar(l)
974-
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e1.`
1004+
return emit(l, K"ErrorInvalidNumericConstant") # `1.e1.`
9751005
end
9761006
else
977-
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e`
1007+
return emit(l, K"ErrorInvalidNumericConstant") # `1.e`
9781008
end
9791009
elseif pc == '.' && ppc != '.' && !is_dottable_operator_start_char(ppc)
9801010
readchar(l)
981-
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.1.`
1011+
return emit(l, K"ErrorInvalidNumericConstant") # `1.1.`
9821012
elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
9831013
pc == '(' || pc == '[' || pc == '{' ||
9841014
pc == '@' || pc == '`' || pc == '"')
985-
return emit_error(l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
1015+
return emit(l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
9861016
end
9871017
elseif (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
9881018
kind = pc == 'f' ? K"Float32" : K"Float"
@@ -992,10 +1022,10 @@ function lex_digit(l::Lexer, kind)
9921022
pc,ppc = dpeekchar(l)
9931023
if pc === '.' && !is_dottable_operator_start_char(ppc)
9941024
accept(l, '.')
995-
return emit_error(l, K"ErrorInvalidNumericConstant") # `1e1.`
1025+
return emit(l, K"ErrorInvalidNumericConstant") # `1e1.`
9961026
end
9971027
else
998-
return emit_error(l, K"ErrorInvalidNumericConstant") # `1e+`
1028+
return emit(l, K"ErrorInvalidNumericConstant") # `1e+`
9991029
end
10001030
elseif position(l) - startpos(l) == 1 && l.chars[1] == '0'
10011031
kind == K"Integer"
@@ -1015,10 +1045,10 @@ function lex_digit(l::Lexer, kind)
10151045
kind = K"Float"
10161046
accept(l, "+-−")
10171047
if !accept_number(l, isdigit) || !had_digits
1018-
return emit_error(l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
1048+
return emit(l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
10191049
end
10201050
elseif isfloat
1021-
return emit_error(l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
1051+
return emit(l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
10221052
end
10231053
is_bin_oct_hex_int = !isfloat
10241054
elseif pc == 'b'
@@ -1038,7 +1068,7 @@ function lex_digit(l::Lexer, kind)
10381068
accept_batch(l, c->isdigit(c) || is_identifier_start_char(c))
10391069
# `0x` `0xg` `0x_` `0x-`
10401070
# `0b123` `0o78p` `0xenomorph` `0xaα`
1041-
return emit_error(l, K"ErrorInvalidNumericConstant")
1071+
return emit(l, K"ErrorInvalidNumericConstant")
10421072
end
10431073
end
10441074
end
@@ -1132,7 +1162,7 @@ function lex_dot(l::Lexer)
11321162
else
11331163
if is_dottable_operator_start_char(peekchar(l))
11341164
readchar(l)
1135-
return emit_error(l, K"ErrorInvalidOperator")
1165+
return emit(l, K"ErrorInvalidOperator")
11361166
else
11371167
return emit(l, K"..")
11381168
end

test/diagnostics.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@ end
1919
Diagnostic(2, 1+sizeof(string(c)), :error, "invisible character $(repr(c))")
2020
end
2121
@test diagnostic(":⥻") == Diagnostic(2, 4, :error, "unknown unicode character '⥻'")
22+
23+
@test diagnostic("\"X \u202a X\"") == Diagnostic(2, 8, :error, "unbalanced bidirectional unicode formatting \"X \\u202a X\"")
24+
@test diagnostic("#= \u202a =#") == Diagnostic(1, 9, :error, "unbalanced bidirectional unicode formatting \"#= \\u202a =#\"")
25+
@test diagnostic("\"X \u202a \$xx\u202c\"", allow_multiple=true) == [
26+
Diagnostic(2, 7, :error, "unbalanced bidirectional unicode formatting \"X \\u202a \"")
27+
Diagnostic(11, 13, :error, "unbalanced bidirectional unicode formatting \"\\u202c\"")
28+
]
2229
end
2330

2431
@testset "parser errors" begin

0 commit comments

Comments
 (0)