36
36
/**
37
37
* Http URI.
38
38
* Parse an HTTP URI from a string or byte array. Given a URI
39
- * <code>http://user@host:port/path/ info;param ?query#fragment</code>
40
- * this class will split it into the following undecoded optional elements:<ul>
39
+ * <code>http://user@host:port/path;param1/%2e/ info;param2 ?query#fragment</code>
40
+ * this class will split it into the following optional elements:<ul>
41
41
* <li>{@link #getScheme()} - http:</li>
42
42
* <li>{@link #getAuthority()} - //name@host:port</li>
43
43
* <li>{@link #getHost()} - host</li>
44
44
* <li>{@link #getPort()} - port</li>
45
- * <li>{@link #getPath()} - /path/info</li>
46
- * <li>{@link #getParam()} - param</li>
45
+ * <li>{@link #getPath()} - /path;param1/%2e/info;param2</li>
46
+ * <li>{@link #getDecodedPath()} - /path/info</li>
47
+ * <li>{@link #getParam()} - param2</li>
47
48
* <li>{@link #getQuery()} - query</li>
48
49
* <li>{@link #getFragment()} - fragment</li>
49
50
* </ul>
50
51
*
51
- * <p>Any parameters will be returned from {@link #getPath()}, but are excluded from the
52
- * return value of {@link #getDecodedPath()}. If there are multiple parameters, the
53
- * {@link #getParam()} method returns only the last one.
54
- */
52
+ * <p>The path part of the URI is provided in both raw form ({@link #getPath()}) and
53
+ * decoded form ({@link #getDecodedPath()}), which has: path parameters removed,
54
+ * percent encoded characters expanded and relative segments resolved. This approach
55
+ * is somewhat contrary to <a href="https://tools.ietf.org/html/rfc3986#section-3.3">RFC3986</a>
56
+ * which no longer defines path parameters (removed after
57
+ * <a href="https://tools.ietf.org/html/rfc2396#section-3.3">RFC2396</a>) and specifies
58
+ * that relative segment normalization should take place before percent encoded character
59
+ * expansion. A literal interpretation of the RFC can result in URI paths with ambiguities
60
+ * when viewed as strings. For example, a URI of {@code /foo%2f..%2fbar} is technically a single
61
+ * segment of "/foo/../bar", but could easily be misinterpreted as 3 segments resolving to "/bar"
62
+ * by a file system.
63
+ * </p>
64
+ * <p>
65
+ * Thus this class avoids and/or detects such ambiguities. Furthermore, by decoding characters and
66
+ * removing parameters before relative path normalization, ambiguous paths will be resolved in such
67
+ * a way to be non-standard-but-non-ambiguous to downstream interpretation of the decoded path string.
68
+ * The violations are recorded and available by API such as {@link #hasViolation(Violation)} so that requests
69
+ * containing them may be rejected in case the non-standard-but-non-ambiguous interpretations
70
+ * are not satisfactory for a given compliance configuration. Implementations that wish to
71
+ * process ambiguous URI paths must configure the compliance modes to accept them and then perform
72
+ * their own decoding of {@link #getPath()}.
73
+ * </p>
74
+ * <p>
75
+ * If there are multiple path parameters, only the last one is returned by {@link #getParam()}.
76
+ * </p>
77
+ **/
55
78
public class HttpURI
56
79
{
57
80
private enum State
@@ -69,28 +92,49 @@ private enum State
69
92
ASTERISK
70
93
}
71
94
72
- enum Violation
95
+ /**
96
+ * Violations of safe URI interpretations
97
+ */
98
+ public enum Violation
73
99
{
74
- SEGMENT ,
75
- SEPARATOR ,
76
- PARAM ,
77
- ENCODING ,
78
- EMPTY ,
79
- UTF16
100
+ /**
101
+ * Ambiguous path segments e.g. <code>/foo/%2E%2E/bar</code>
102
+ */
103
+ SEGMENT ("Ambiguous path segments" ),
104
+ /**
105
+ * Ambiguous path separator within a URI segment e.g. <code>/foo%2Fbar</code>
106
+ */
107
+ SEPARATOR ("Ambiguous path separator" ),
108
+ /**
109
+ * Ambiguous path parameters within a URI segment e.g. <code>/foo/..;/bar</code>
110
+ */
111
+ PARAM ("Ambiguous path parameters" ),
112
+ /**
113
+ * Ambiguous double encoding within a URI segment e.g. <code>/%2557EB-INF</code>
114
+ */
115
+ ENCODING ("Ambiguous double encoding" ),
116
+ /**
117
+ * Ambiguous empty segments e.g. <code>/foo//bar</code>
118
+ */
119
+ EMPTY ("Ambiguous empty segments" ),
120
+ /**
121
+ * Non standard UTF-16 encoding e.g. <code>/foo%u2192bar</code>.
122
+ */
123
+ UTF16 ("Non standard UTF-16 encoding" );
124
+
125
+ private final String _message ;
126
+
127
+ Violation (String message )
128
+ {
129
+ _message = message ;
130
+ }
131
+
132
+ String getMessage ()
133
+ {
134
+ return _message ;
135
+ }
80
136
}
81
137
82
- /**
83
- * The concept of URI path parameters was originally specified in
84
- * <a href="https://tools.ietf.org/html/rfc2396#section-3.3">RFC2396</a>, but that was
85
- * obsoleted by
86
- * <a href="https://tools.ietf.org/html/rfc3986#section-3.3">RFC3986</a> which removed
87
- * a normative definition of path parameters. Specifically it excluded them from the
88
- * <a href="https://tools.ietf.org/html/rfc3986#section-5.2.4">Remove Dot Segments</a>
89
- * algorithm. This results in some ambiguity as dot segments can result from later
90
- * parameter removal or % encoding expansion, that are not removed from the URI
91
- * by {@link URIUtil#canonicalPath(String)}. Thus this class flags such ambiguous
92
- * path segments, so that they may be rejected by the server if so configured.
93
- */
94
138
private static final Trie <Boolean > __ambiguousSegments = new ArrayTrie <>();
95
139
96
140
static
@@ -179,6 +223,22 @@ public HttpURI(HttpURI uri)
179
223
_emptySegment = false ;
180
224
}
181
225
226
+ public HttpURI (HttpURI schemeHostPort , HttpURI uri )
227
+ {
228
+ _scheme = schemeHostPort ._scheme ;
229
+ _user = schemeHostPort ._user ;
230
+ _host = schemeHostPort ._host ;
231
+ _port = schemeHostPort ._port ;
232
+ _path = uri ._path ;
233
+ _param = uri ._param ;
234
+ _query = uri ._query ;
235
+ _fragment = uri ._fragment ;
236
+ _uri = uri ._uri ;
237
+ _decodedPath = uri ._decodedPath ;
238
+ _violations .addAll (uri ._violations );
239
+ _emptySegment = false ;
240
+ }
241
+
182
242
public HttpURI (String uri )
183
243
{
184
244
_port = -1 ;
@@ -506,6 +566,8 @@ else if (c == '/')
506
566
{
507
567
switch (encodedValue )
508
568
{
569
+ case 0 :
570
+ throw new IllegalArgumentException ("Illegal character in path" );
509
571
case '/' :
510
572
_violations .add (Violation .SEPARATOR );
511
573
break ;
@@ -677,10 +739,12 @@ else if (c == '/')
677
739
}
678
740
else if (_path != null )
679
741
{
680
- String canonical = URIUtil .canonicalPath (_path );
681
- if (canonical == null )
682
- throw new BadMessageException ("Bad URI" );
683
- _decodedPath = URIUtil .decodePath (canonical );
742
+ // The RFC requires the path to be canonicalized before decoding, but that can leave encoded dot and dot-dot segments
743
+ // which are never canonicalized and could be used in an attempt to bypass security checks, so decode first.
744
+ String decodeNonCanonical = URIUtil .decodePath (_path );
745
+ _decodedPath = URIUtil .canonicalPath (decodeNonCanonical );
746
+ if (_decodedPath == null )
747
+ throw new IllegalArgumentException ("Bad URI" );
684
748
}
685
749
}
686
750
@@ -794,6 +858,11 @@ public boolean hasViolations()
794
858
return !_violations .isEmpty ();
795
859
}
796
860
861
+ public boolean hasViolation (Violation violation )
862
+ {
863
+ return _violations .contains (violation );
864
+ }
865
+
797
866
/**
798
867
* @return True if the URI encodes UTF-16 characters with '%u'.
799
868
*/
@@ -839,6 +908,11 @@ public String getDecodedPath()
839
908
return _decodedPath ;
840
909
}
841
910
911
+ /**
912
+ * Get a URI path parameter. Multiple and in-segment parameters are ignored and only
913
+ * the last trailing parameter is returned.
914
+ * @return The last path parameter or null
915
+ */
842
916
public String getParam ()
843
917
{
844
918
return _param ;
0 commit comments