From 77325ad05c18876b3095c31fa0f5d8f6a91c7e53 Mon Sep 17 00:00:00 2001 From: Brian Neradt Date: Tue, 18 Jul 2023 22:23:10 +0000 Subject: [PATCH] Correctly handle encoding for cache hash generation Since origins may treat URL encoded or unencoded paths, query parameters, or fragments differently, we should cache them separately. This updates our URL cache hashing logic to not unencode these components of a URI. --- proxy/hdrs/URL.cc | 10 +++ proxy/hdrs/unit_tests/test_URL.cc | 127 +++++++++++++++++++++++++++ tests/gold_tests/url/uri.replay.yaml | 111 +++++++++++++++++++++-- tests/gold_tests/url/uri.test.py | 8 +- 4 files changed, 248 insertions(+), 8 deletions(-) diff --git a/proxy/hdrs/URL.cc b/proxy/hdrs/URL.cc index 63aedcccb8f..0cea33ac9ee 100644 --- a/proxy/hdrs/URL.cc +++ b/proxy/hdrs/URL.cc @@ -1859,6 +1859,16 @@ url_CryptoHash_get_general(const URLImpl *url, CryptoContext &ctx, CryptoHash &h while (t < ends[i]) { if ((i == 0) || (i == 6)) { // scheme and host unescape_str_tolower(p, e, t, ends[i], s); + } else if (i == 8 || i == 10 || i == 12) { // path, params, query + // Don't unescape the parts of the URI that are processed by the + // origin since it may behave differently based upon whether these are + // escaped or not. Therefore differently encoded strings should be + // cached separately via differentiated hashes. + int path_len = ends[i] - t; + int min_len = std::min(path_len, static_cast(e - p)); + memcpy(p, t, min_len); + p += min_len; + t += min_len; } else { unescape_str(p, e, t, ends[i], s); } diff --git a/proxy/hdrs/unit_tests/test_URL.cc b/proxy/hdrs/unit_tests/test_URL.cc index bb1886d5800..a5741839ef7 100644 --- a/proxy/hdrs/unit_tests/test_URL.cc +++ b/proxy/hdrs/unit_tests/test_URL.cc @@ -23,6 +23,7 @@ #include "catch.hpp" #include "URL.h" +#include "tscore/CryptoHash.h" TEST_CASE("ValidateURL", "[proxy][validurl]") { @@ -562,3 +563,129 @@ TEST_CASE("UrlParse", "[proxy][parseurl]") test_parse(test_case, URL_PARSE_REGEX); } } + +struct get_hash_test_case { + const std::string description; + const std::string uri_1; + const std::string uri_2; + const bool ignore_query; + const bool has_equal_hash; +}; + +constexpr bool HAS_EQUAL_HASH = true; +constexpr bool IGNORE_QUERY = true; + +// clang-format off +std::vector get_hash_test_cases = { + { + "No encoding: equal hashes", + "http://one.example.com/a/path?name=value#some=value?with_question#fragment", + "http://one.example.com/a/path?name=value#some=value?with_question#fragment", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + "Scheme encoded: equal hashes", + "http%3C://one.example.com/a/path?name=value#some=value?with_question#fragment", + "http<://one.example.com/a/path?name=value#some=value?with_question#fragment", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + "Host encoded: equal hashes", + "http://one%2Eexample.com/a/path?name=value#some=value?with_question#fragment", + "http://one.example.com/a/path?name=value#some=value?with_question#fragment", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + "Path encoded: differing hashes", + "http://one.example.com/a%2Fpath?name=value#some=value?with_question#fragment", + "http://one.example.com/a/path?name=value#some=value?with_question#fragment", + !IGNORE_QUERY, + !HAS_EQUAL_HASH, + }, + { + "Query = encoded: differing hashes", + "http://one.example.com/a/path?name%3Dvalue#some=value?with_question#fragment", + "http://one.example.com/a/path?name=value#some=value?with_question#fragment", + !IGNORE_QUERY, + !HAS_EQUAL_HASH, + }, + { + "Query = encoded but ignore_query: equal hashes", + "http://one.example.com/a/path?name%3Dvalue#some=value?with_question#fragment", + "http://one.example.com/a/path?name=value#some=value?with_question#fragment", + IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + "Query internal encoded: differing hashes", + "http://one.example.com/a/path?name=valu%5D#some=value?with_question#fragment", + "http://one.example.com/a/path?name=valu]#some=value?with_question#fragment", + !IGNORE_QUERY, + !HAS_EQUAL_HASH, + }, + { + "Query internal encoded but ignore_query: equal hashes", + "http://one.example.com/a/path?name=valu%5D#some=value?with_question#fragment", + "http://one.example.com/a/path?name=valu]#some=value?with_question#fragment", + IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + "Fragment encoded: fragment is not part of the hash", + "http://one.example.com/a/path?name=value#some=value?with_question#frag%7Dent", + "http://one.example.com/a/path?name=value#some=value?with_question/frag}ent", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + "Username encoded: equal hashes", + "mysql://my%7Eser:mypassword@localhost/mydatabase", + "mysql://my~ser:mypassword@localhost/mydatabase", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, + { + "Password encoded: equal hashes", + "mysql://myuser:mypa%24sword@localhost/mydatabase", + "mysql://myuser:mypa$sword@localhost/mydatabase", + !IGNORE_QUERY, + HAS_EQUAL_HASH, + }, +}; + +/** Return the hash related to a URI. + * + * @param[in] uri The URI to hash. + * @return The hash of the URI. + */ +CryptoHash +get_hash(const std::string &uri, bool ignore_query) +{ + URL url; + HdrHeap *heap = new_HdrHeap(); + url.create(heap); + url.parse(uri); + CryptoHash hash; + url.hash_get(&hash, ignore_query); + heap->destroy(); + return hash; +} + +TEST_CASE("UrlHashGet", "[url][hash_get]") +{ + for (auto const &test_case : get_hash_test_cases) { + std::string description = test_case.description + ": " + test_case.uri_1 + " vs " + test_case.uri_2; + SECTION(description) { + CryptoHash hash1 = get_hash(test_case.uri_1, test_case.ignore_query); + CryptoHash hash2 = get_hash(test_case.uri_2, test_case.ignore_query); + if (test_case.has_equal_hash) { + CHECK(hash1 == hash2); + } else { + CHECK(hash1 != hash2); + } + } + } +} diff --git a/tests/gold_tests/url/uri.replay.yaml b/tests/gold_tests/url/uri.replay.yaml index 4935ff7951e..d6689db11fb 100644 --- a/tests/gold_tests/url/uri.replay.yaml +++ b/tests/gold_tests/url/uri.replay.yaml @@ -33,12 +33,20 @@ meta: - [ Content-Length, 16 ] - [ Cache-Control, max-age=300 ] + - 404_ok_response: &404_ok_response + server-response: + status: 404 + reason: "Not Found" + headers: + fields: + - [ Content-Length, 16 ] + - [ Cache-Control, max-age=300 ] + sessions: - transactions: # A simply path - - all: { headers: { fields: [[ uuid, 1 ]]}} - client-request: + - client-request: method: "GET" version: "1.1" scheme: "http" @@ -46,6 +54,7 @@ sessions: headers: fields: - [ Host, example.com ] + - [ uuid, 1 ] <<: *200_ok_response @@ -54,8 +63,7 @@ sessions: # An initial // without an authority is not valid per RFC 3986 section-3.3, # but we have historically accepted it and will continue to do so. - - all: { headers: { fields: [[ uuid, 2 ]]}} - client-request: + - client-request: method: "GET" version: "1.1" scheme: "http" @@ -63,6 +71,7 @@ sessions: headers: fields: - [ Host, example.com ] + - [ uuid, 2 ] <<: *200_ok_response @@ -70,8 +79,7 @@ sessions: status: 200 # A '^' is an invalid path. - - all: { headers: { fields: [[ uuid, 3 ]]}} - client-request: + - client-request: method: "GET" version: "1.1" scheme: "http" @@ -79,8 +87,99 @@ sessions: headers: fields: - [ Host, example.com ] + - [ uuid, 3 ] <<: *200_ok_response proxy-response: status: 400 + +- transactions: + # + # Make sure that we consistently treat encoding of paths. + # + - client-request: + method: "GET" + version: "1.1" + url: /path%2ffoo + headers: + fields: + - [ Host, test.com ] + - [ Connection, keep-alive ] + - [ uuid, encoded_path ] + + proxy-request: + # Verify that the origin gets the encoded path. + url: /path%2ffoo + + <<: *404_ok_response + + proxy-response: + status: 404 + + # Request with an unencoded path: go to origin. + - client-request: + # Give plenty of time to cache the response. + delay: 200ms + + method: "GET" + version: "1.1" + # Note that the path is not encoded. + url: /path/foo + headers: + fields: + - [ Host, test.com ] + - [ Connection, keep-alive ] + - [ uuid, unencoded_path ] + + proxy-request: + url: /path/foo + + <<: *200_ok_response + + proxy-response: + # ATS should treat the unencoded path as different and go to the origin, + # not serve the encoded path out of cache. + status: 200 + + # Request again with an encoded path, should be served from cache. + - client-request: + method: "GET" + version: "1.1" + url: /path%2ffoo + headers: + fields: + - [ Host, test.com ] + - [ Connection, keep-alive ] + - [ uuid, encoded_path_again ] + + # The origin should not receive this request, but if it does, reply with a + # non-404 response so we can detect it. + <<: *200_ok_response + + proxy-response: + # ATS should serve the encoded path response from cache. + status: 404 + + # Request again with an encoded path, should be served from cache. + - client-request: + # Give plenty of time to cache the unencoded response. + delay: 200ms + + method: "GET" + version: "1.1" + # Note that the path is not encoded. + url: /path/foo + headers: + fields: + - [ Host, test.com ] + - [ Connection, keep-alive ] + - [ uuid, unencoded_path_again ] + + # The origin should not receive this request, but if it does, reply with a + # non-200 response so we can detect it. + <<: *404_ok_response + + proxy-response: + # Verify that ATS returns out of cache for the uneencoded path object. + status: 200 diff --git a/tests/gold_tests/url/uri.test.py b/tests/gold_tests/url/uri.test.py index 8589d07f0f3..f1ccd77e009 100644 --- a/tests/gold_tests/url/uri.test.py +++ b/tests/gold_tests/url/uri.test.py @@ -26,7 +26,7 @@ server = Test.MakeVerifierServerProcess("server", replay_file) ts.Disk.records_config.update({ 'proxy.config.diags.debug.enabled': 1, - 'proxy.config.diags.debug.tags': 'http', + 'proxy.config.diags.debug.tags': 'http|cache|url', }) ts.Disk.remap_config.AddLine( 'map / http://127.0.0.1:{0}'.format(server.Variables.http_port) @@ -34,4 +34,8 @@ tr = Test.AddTestRun("Verify correct URI parsing behavior.") tr.Processes.Default.StartBefore(server) tr.Processes.Default.StartBefore(ts) -tr.AddVerifierClientProcess("client", replay_file, http_ports=[ts.Variables.port]) +tr.AddVerifierClientProcess( + "client", + replay_file, + http_ports=[ts.Variables.port], + other_args='--thread-limit 1')