Fix: Race between manifest object creation and application of tags (#5958)#6003
Conversation
Codecov ReportAttention: Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## develop #6003 +/- ##
===========================================
- Coverage 85.26% 85.13% -0.13%
===========================================
Files 155 155
Lines 20456 20488 +32
===========================================
+ Hits 17441 17443 +2
- Misses 3015 3045 +30 ☔ View full report in Codecov by Sentry. |
12f5e8f to
6aed74d
Compare
nadove-ucsc
left a comment
There was a problem hiding this comment.
Subject: [PATCH] review
---
Index: test/integration_test.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/test/integration_test.py b/test/integration_test.py
--- a/test/integration_test.py (revision 6aed74d7a4ce6f480c5aac8fdeb270327bb018a9)
+++ b/test/integration_test.py (date 1709251288557)
@@ -650,23 +650,20 @@
ManifestFormat.compact: self._check_compact_manifest,
ManifestFormat.curl: self._check_curl_manifest,
}
+ format, validator = self.random.choice([*validators.items()])
filters = self.__init_manifest_filters(catalog)
-
- format = self.random.choice([*validators])
- manifeset_url = config.service_endpoint.set(path='/manifest/files')
+ manifest_url = config.service_endpoint.set(path='/manifest/files',
+ args=dict(catalog=catalog,
+ filters=json.dumps(filters),
+ format=format.value))
with self.subTest('manifest-tagging', catalog=catalog, format=format):
- args = dict(catalog=catalog, filters=json.dumps(filters))
- validator = validators[format]
- args['format'] = format.value
- manifeset_url.args = args
method = PUT
-
- responses = list()
+ responses = []
while True:
- response = self._get_url(method, manifeset_url)
- responses.append(response)
+ response = self._get_url(method, manifest_url)
if response.status == 301:
+ responses.append(response)
# Request the same manifest without following the redirect
# in order to expose the race condition that returns an
# untagged cached manifest. This happens when a step
@@ -674,12 +671,12 @@
# in the process of tagging the object.
time.sleep(2)
elif response.status == 302:
- method, manifeset_url = GET, furl(response.headers['Location'])
+ responses.append(response)
+ method, manifest_url = GET, furl(response.headers['Location'])
else:
break
validator(catalog, response.data)
- responses.remove(response)
execution_ids = self._manifest_execution_ids(responses, fetch=False)
self.assertEqual(1, len(execution_ids))
src/azul/service/manifest_service.py
Outdated
| # Can't be absent under S3's strong consistency | ||
| assert False, (object_key, self.file_name_tag) | ||
| # Paged manifest generators apply the tag after creating | ||
| # the object. If we happened to be right inbetween the |
There was a problem hiding this comment.
| # the object. If we happened to be right inbetween the | |
| # the object. If we happen to be right in between the |
test/integration_test.py
Outdated
| expect_execution = fetch == first_fetch | ||
| self.assertEqual(1 if expect_execution else 0, len(execution_ids)) | ||
|
|
||
| def __init_manifest_filters(self, catalog: CatalogName) -> JSON: |
There was a problem hiding this comment.
Method name is strange. Why two underscores? I can't think anything satisfactorily descriptive that's also reasonably terse, so unless you can think of something better I'd just go with
| def __init_manifest_filters(self, catalog: CatalogName) -> JSON: | |
| def _manifest_filters(self, catalog: CatalogName) -> JSON: |
There was a problem hiding this comment.
Also, commit title in unnecessarily detailed. I think "Refactor IT manifest filters" is sufficient. The motivation is obvious from context.
test/integration_test.py
Outdated
| } | ||
| filters = self.__init_manifest_filters(catalog) | ||
|
|
||
| format = self.random.choice([*validators]) |
There was a problem hiding this comment.
Choosing a format at random seems odd to me. If we gain a meaningful increase in coverage from testing multiple formats, we should always test both, otherwise, pick one so we can simplify the test.
src/azul/service/manifest_service.py
Outdated
| file_name = self.file_name(manifest_key, base_name=partition.file_name) | ||
| tagging = self.tagging(file_name) | ||
| if tagging is not None: | ||
| time.sleep(16) # Exposes race between manifest completion and tagging |
There was a problem hiding this comment.
If this is needed to expose the race condition, why is it in a drop! commit?
There was a problem hiding this comment.
PL:
We'll be using the drop! commit in sandbox IT for testing/validation purposes.
7483662 to
9f6f649
Compare
nadove-ucsc
left a comment
There was a problem hiding this comment.
The drop! commit message claims that it reverts commit 748366, but that commit isn't on this branch. I presume that it's meant to point at b269717 and the reference was broken due to a rebase.
I'm also confused as why the change to 16s is included in this commit, since that line wasn't changed in the commit that it appears to be reverting. Normally, if commit B claims to revert commit A, then I expect squashing A and B to result in an empty diff, but that doesn't seem to be the case here. Perhaps you could have two separate drop! commits, one to revert the fix and one to change the sleep time? But I might be overthinking this.
test/integration_test.py
Outdated
| ManifestFormat.curl: self._check_curl_manifest, | ||
| } | ||
|
|
||
| for format, validator in validators: |
There was a problem hiding this comment.
Did you test this? I think it needs to be:
| for format, validator in validators: | |
| for format, validator in validators.items(): |
test/integration_test.py
Outdated
| } | ||
|
|
||
| def _test_manifest_tagging_race(self, catalog: CatalogName): | ||
| validators: dict[ManifestFormat, Callable[[str, bytes], None]] = { |
There was a problem hiding this comment.
| validators: dict[ManifestFormat, Callable[[str, bytes], None]] = { | |
| validators: dict[ManifestFormat, Callable[[CatalogName, bytes], None]] = { |
1c4ade6 to
509d2ba
Compare
Yeah, I was trying to keep the changes necessary to expose the race in a single |
0856bec to
75b4b72
Compare
|
For the next round, also add service logs with 500. |
6933dfb to
e6df3a6
Compare
e6df3a6 to
216047b
Compare
|
As confirmed during PL, this is the expected failure since we end up using a request method Lines 853 to 855 in 3ddf36e And as describe in issue #5958, this issue exhibits itself by returning a 500 status response from the service, when present. When the |
hannes-ucsc
left a comment
There was a problem hiding this comment.
Index: test/integration_test.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/test/integration_test.py b/test/integration_test.py
--- a/test/integration_test.py (revision 216047b1f0dc8f2f844979b8bb9292058e8d3b70)
+++ b/test/integration_test.py (date 1712263094671)
@@ -648,45 +648,44 @@
def _test_manifest_tagging_race(self, catalog: CatalogName):
validators: dict[ManifestFormat, Callable[[CatalogName, bytes], None]] = {
- ManifestFormat.compact: self._check_compact_manifest
+ ManifestFormat.compact: self._check_compact_manifest,
+ ManifestFormat.curl: self._check_curl_manifest
}
- # Test curl manifest format only in plugins that use it
- if ManifestFormat.curl in self.metadata_plugin(catalog).manifest_formats:
- validators[ManifestFormat.curl] = self._check_curl_manifest
-
+ plugin = self.metadata_plugin(catalog)
for format, validator in validators.items():
- with self.subTest('manifest_tagging_race', catalog=catalog, format=format):
- filters = self._manifest_filters(catalog)
- manifest_url = config.service_endpoint.set(path='/manifest/files',
- args=dict(catalog=catalog,
- filters=json.dumps(filters),
- format=format.value))
- method = PUT
- responses = []
- while True:
- response = self._get_url(method, manifest_url)
- if response.status == 301:
- responses.append(response)
- # Request the same manifest without following the
- # redirect in order to expose the race condition that
- # returns an untagged cached manifest. This happens
- # when a step function execution has generated a
- # manifest but is still in the process of tagging the
- # object. The more requests we make, the more likely it
- # is that we catch the execution in this racy state. We
- # still have to throttle the requests in order to
- # prevent tripping the WAF rate limit.
- rate_limit = config.waf_rate_rule_period / config.waf_rate_rule_limit
- time.sleep(rate_limit)
- elif response.status == 302:
- responses.append(response)
- method, manifest_url = GET, furl(response.headers['Location'])
- else:
- break
+ if format in plugin.manifest_formats:
+ with self.subTest('manifest_tagging_race', catalog=catalog, format=format):
+ filters = self._manifest_filters(catalog)
+ manifest_url = config.service_endpoint.set(path='/manifest/files',
+ args=dict(catalog=catalog,
+ filters=json.dumps(filters),
+ format=format.value))
+ method = PUT
+ responses = []
+ while True:
+ response = self._get_url(method, manifest_url)
+ if response.status == 301:
+ responses.append(response)
+ # Request the same manifest without following the
+ # redirect in order to expose the race condition that
+ # returns an untagged cached manifest. This happens
+ # when a step function execution has generated a
+ # manifest but is still in the process of tagging the
+ # object. The more requests we make, the more likely it
+ # is that we catch the execution in this racy state. We
+ # still have to throttle the requests in order to
+ # prevent tripping the WAF rate limit.
+ rate_limit = config.waf_rate_rule_period / config.waf_rate_rule_limit
+ time.sleep(rate_limit)
+ elif response.status == 302:
+ responses.append(response)
+ method, manifest_url = GET, furl(response.headers['Location'])
+ else:
+ break
- validator(catalog, response.data)
- execution_ids = self._manifest_execution_ids(responses, fetch=False)
- self.assertEqual(1, len(execution_ids))
+ validator(catalog, response.data)
+ execution_ids = self._manifest_execution_ids(responses, fetch=False)
+ self.assertEqual(1, len(execution_ids))
def _manifest_execution_ids(self,
responses: list[urllib3.HTTPResponse],|
@dsotirho-ucsc, same drill as last time we tried to merge this: #6003 (comment) with the additional step documented here: #6003 (comment) |
|
As expected, PR failed IT on sandbox. Drop commits have been removed. CloudWatch Logs on sandbox: |
hannes-ucsc
left a comment
There was a problem hiding this comment.
The sandbox failures look good. We don't need to perform that experiment again.
But I do want to make a few refactorings:
Subject: [PATCH] REVIEW 2
---
Index: test/integration_test.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/test/integration_test.py b/test/integration_test.py
--- a/test/integration_test.py (revision 9cfdbee5fc17671da184e471848373c42e313795)
+++ b/test/integration_test.py (date 1712346000951)
@@ -578,15 +578,7 @@
self._check_endpoint(GET, path, args=args, endpoint=endpoint)
def _test_manifest(self, catalog: CatalogName):
- supported_formats = self.metadata_plugin(catalog).manifest_formats
- assert supported_formats
- validators: dict[ManifestFormat, Callable[[str, bytes], None]] = {
- ManifestFormat.compact: self._check_compact_manifest,
- ManifestFormat.terra_bdbag: self._check_terra_bdbag_manifest,
- ManifestFormat.terra_pfb: self._check_terra_pfb_manifest,
- ManifestFormat.curl: self._check_curl_manifest,
- ManifestFormat.verbatim_jsonl: self._check_jsonl_manifest
- }
+ supported_formats = self._manifest_formats(catalog)
for format in [None, *supported_formats]:
filters = self._manifest_filters(catalog)
first_fetch = bool(self.random.getrandbits(1))
@@ -594,9 +586,8 @@
with self.subTest('manifest', catalog=catalog, format=format, fetch=fetch):
args = dict(catalog=catalog, filters=json.dumps(filters))
if format is None:
- validator = validators[first(supported_formats)]
+ format = first(supported_formats)
else:
- validator = validators[format]
args['format'] = format.value
# Wrap self._get_url to collect all HTTP responses
@@ -616,7 +607,7 @@
def worker(_):
response = self._check_endpoint(PUT, '/manifest/files', args=args, fetch=fetch)
- validator(catalog, response)
+ self._manifest_validators[format](catalog, response)
num_workers = 3
with ThreadPoolExecutor(max_workers=num_workers) as tpe:
@@ -646,14 +637,25 @@
}
}
- def _test_manifest_tagging_race(self, catalog: CatalogName):
- validators: dict[ManifestFormat, Callable[[CatalogName, bytes], None]] = {
+ def _manifest_formats(self, catalog: CatalogName) -> Sequence[ManifestFormat]:
+ supported_formats = self.metadata_plugin(catalog).manifest_formats
+ assert supported_formats
+ return supported_formats
+
+ @cached_property
+ def _manifest_validators(self) -> dict[ManifestFormat, Callable[[str, bytes], None]]:
+ return {
ManifestFormat.compact: self._check_compact_manifest,
- ManifestFormat.curl: self._check_curl_manifest
+ ManifestFormat.terra_bdbag: self._check_terra_bdbag_manifest,
+ ManifestFormat.terra_pfb: self._check_terra_pfb_manifest,
+ ManifestFormat.curl: self._check_curl_manifest,
+ ManifestFormat.verbatim_jsonl: self._check_jsonl_manifest
}
- plugin = self.metadata_plugin(catalog)
- for format, validator in validators.items():
- if format in plugin.manifest_formats:
+
+ def _test_manifest_tagging_race(self, catalog: CatalogName):
+ supported_formats = self._manifest_formats(catalog)
+ for format in [ManifestFormat.compact, ManifestFormat.curl]:
+ if format in supported_formats:
with self.subTest('manifest_tagging_race', catalog=catalog, format=format):
filters = self._manifest_filters(catalog)
manifest_url = config.service_endpoint.set(path='/manifest/files',
@@ -684,11 +686,11 @@
responses.append(response)
method, manifest_url = GET, furl(response.headers['Location'])
else:
+ assert response.status == 200, response
+ self._manifest_validators[format](catalog, response.data)
break
- validator(catalog, response.data)
- execution_ids = self._manifest_execution_ids(responses,
- fetch=False)
+ execution_ids = self._manifest_execution_ids(responses, fetch=False)
self.assertEqual(1, len(execution_ids))
def _manifest_execution_ids(self,We can leave the drop commits off.
src/azul/service/manifest_service.py
Outdated
| # Paged manifest generators apply the tag after creating | ||
| # the object. If we happen to be right in between the | ||
| # creation and application of tags, we pretend that the | ||
| # manifest object doesn't exist, assuming that the caller | ||
| # will tell the client to check back later. Note that this | ||
| # involves the attempted creation of step-function | ||
| # execution, but because step-functions generate manifests | ||
| # idempotentently, subsequent requests for the same | ||
| # manifest are associated to a single SF execution. |
There was a problem hiding this comment.
| # Paged manifest generators apply the tag after creating | |
| # the object. If we happen to be right in between the | |
| # creation and application of tags, we pretend that the | |
| # manifest object doesn't exist, assuming that the caller | |
| # will tell the client to check back later. Note that this | |
| # involves the attempted creation of step-function | |
| # execution, but because step-functions generate manifests | |
| # idempotentently, subsequent requests for the same | |
| # manifest are associated to a single SF execution. | |
| # While unpaged manifest generators apply the tag *at* object creation, paged ones do so in a separate request. If we happen to get here right in between the | |
| # creation and application of tags, we pretend that the | |
| # manifest object doesn't exist, relying on client to check back later. Note that this | |
| # involves the attempted creation of a step-function (SF) | |
| # execution, but because SFs generate manifests | |
| # idempotently, subsequent requests for the same | |
| # manifest are associated with a single SF execution. |
There was a problem hiding this comment.
I don't understand the second sentence. Please raise in PL.
hannes-ucsc
left a comment
There was a problem hiding this comment.
Conflicts, unfortunately.
Security design review
|
…ags (#5958) Add test to expose manifest tagging race condition
Connected issues: #5958
Checklist
Author
developissues/<GitHub handle of author>/<issue#>-<slug>1 when the issue title describes a problem, the corresponding PR
title is
Fix:followed by the issue titleAuthor (partiality)
ptag to titles of partial commitspartialor completely resolves all connected issuespartiallabelAuthor (chains)
baseor this PR is not chained to another PRchainedor is not chained to another PRAuthor (reindex, API changes)
rtag to commit title or the changes introduced by this PR will not require reindexing of any deploymentreindex:devor the changes introduced by it will not require reindexing ofdevreindex:anvildevor the changes introduced by it will not require reindexing ofanvildevreindex:anvilprodor the changes introduced by it will not require reindexing ofanvilprodreindex:prodor the changes introduced by it will not require reindexing ofprodreindex:partialand its description documents the specific reindexing procedure fordev,anvildev,anvilprodandprodor requires a full reindex or carries none of the labelsreindex:dev,reindex:anvildev,reindex:anvilprodandreindex:prodAPIor this PR does not modify a REST APIa(A) tag to commit title for backwards (in)compatible changes or this PR does not modify a REST APIapp.pyor this PR does not modify a REST APIAuthor (upgrading deployments)
make image_manifests.jsonand committed the resulting changes or this PR does not modifyazul_docker_images, or any other variables referenced in the definition of that variableutag to commit title or this PR does not require upgrading deploymentsupgradeor does not require upgrading deploymentsdeploy:sharedor does not modifyimage_manifests.json, and does not require deploying thesharedcomponent for any other reasondeploy:gitlabor does not require deploying thegitlabcomponentdeploy:runneror does not require deploying therunnerimageAuthor (hotfixes)
Ftag to main commit title or this PR does not include permanent fix for a temporary hotfixanvilprodandprod) have temporary hotfixes for any of the issues connected to this PRAuthor (before every review)
develop, squashed old fixupsmake requirements_updateor this PR does not modifyrequirements*.txt,common.mk,MakefileandDockerfileRtag to commit title or this PR does not modifyrequirements*.txtreqsor does not modifyrequirements*.txtmake integration_testpasses in personal deployment or this PR does not modify functionality that could affect the IT outcomePeer reviewer (after approval)
System administrator (after approval)
demoorno demono demono sandboxN reviewslabel is accurateOperator (before pushing merge the commit)
reindex:…labels andrcommit title tagno demodevelop_select dev.shared && CI_COMMIT_REF_NAME=develop make -C terraform/shared apply_keep_unusedor this PR is not labeleddeploy:shared_select dev.gitlab && CI_COMMIT_REF_NAME=develop make -C terraform/gitlab applyor this PR is not labeleddeploy:gitlab_select anvildev.shared && CI_COMMIT_REF_NAME=develop make -C terraform/shared apply_keep_unusedor this PR is not labeleddeploy:shared_select anvildev.gitlab && CI_COMMIT_REF_NAME=develop make -C terraform/gitlab applyor this PR is not labeleddeploy:gitlabdeploy:gitlabdeploy:gitlabSystem administrator
dev.gitlabare complete or this PR is not labeleddeploy:gitlabanvildev.gitlabare complete or this PR is not labeleddeploy:gitlabOperator (before pushing merge the commit)
_select dev.gitlab && make -C terraform/gitlab/runneror this PR is not labeleddeploy:runner_select anvildev.gitlab && make -C terraform/gitlab/runneror this PR is not labeleddeploy:runnersandboxlabel or PR is labeledno sandboxdevor PR is labeledno sandboxanvildevor PR is labeledno sandboxsandboxdeployment or PR is labeledno sandboxanvilboxdeployment or PR is labeledno sandboxsandboxdeployment or PR is labeledno sandboxanvilboxdeployment or PR is labeledno sandboxsandboxor this PR does not remove catalogs or otherwise causes unreferenced indices indevanvilboxor this PR does not remove catalogs or otherwise causes unreferenced indices inanvildevsandboxor this PR is not labeledreindex:devanvilboxor this PR is not labeledreindex:anvildevsandboxor this PR is not labeledreindex:devanvilboxor this PR is not labeledreindex:anvildevpif the PR is also labeledpartialOperator (chain shortening)
developor this PR is not labeledbasechainedlabel from the blocked PR or this PR is not labeledbasebasebaselabel from this PR or this PR is not labeledbaseOperator (after pushing the merge commit)
devanvildevdevdevanvildevanvildev_select dev.shared && make -C terraform/shared applyor this PR is not labeleddeploy:shared_select anvildev.shared && make -C terraform/shared applyor this PR is not labeleddeploy:shareddevanvildevOperator (reindex)
devor this PR is neither labeledreindex:partialnorreindex:devanvildevor this PR is neither labeledreindex:partialnorreindex:anvildevdevor this PR is neither labeledreindex:partialnorreindex:devanvildevor this PR is neither labeledreindex:partialnorreindex:anvildevdevor this PR is neither labeledreindex:partialnorreindex:devanvildevor this PR is neither labeledreindex:partialnorreindex:anvildevdevor this PR does not require reindexingdevanvildevor this PR does not require reindexinganvildevdevor this PR does not require reindexingdevanvildevor this PR does not require reindexinganvildevdevor this PR does not require reindexingdevanvildevor this PR does not require reindexinganvildevOperator
deploy:shared,deploy:gitlab,deploy:runner,reindex:partial,reindex:anvilprodandreindex:prodlabels to the next promotion PRs or this PR carries none of these labelsdeploy:shared,deploy:gitlab,deploy:runner,reindex:partial,reindex:anvilprodandreindex:prodlabels, from the description of this PR to that of the next promotion PRs or this PR carries none of these labelsShorthand for review comments
Lline is too longWline wrapping is wrongQbad quotesFother formatting problem