From da17c26198b154d8994096cb2cb59afa8ea3e326 Mon Sep 17 00:00:00 2001 From: Matt Robenolt Date: Wed, 14 Dec 2016 14:56:26 -0800 Subject: [PATCH 1/3] javascript: don't gzip files before caching This is unnecessary because python-memcached will gzip for us. By doing this ourselves, we're actually adding extra work by double gzipping. --- src/sentry/lang/javascript/processor.py | 30 +++++-------------------- src/sentry/utils/files.py | 9 ++------ 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 9f0abb24e6a7cb..003929facd05a0 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -8,7 +8,6 @@ import base64 import six import time -import zlib from django.conf import settings from django.core.exceptions import SuspiciousOperation @@ -218,7 +217,7 @@ def discover_sourcemap(result): def fetch_release_file(filename, release): - cache_key = 'releasefile:v1:%s:%s' % ( + cache_key = 'releasefile:v2:%s:%s' % ( release.id, md5_text(filename).hexdigest(), ) @@ -267,7 +266,7 @@ def fetch_release_file(filename, release): try: with metrics.timer('sourcemaps.release_file_read'): with releasefile.file.getfile() as fp: - z_body, body = compress_file(fp) + body = compress_file(fp) except Exception as e: logger.exception(six.text_type(e)) cache.set(cache_key, -1, 3600) @@ -276,20 +275,12 @@ def fetch_release_file(filename, release): headers = {k.lower(): v for k, v in releasefile.file.headers.items()} encoding = get_encoding_from_headers(headers) result = (headers, body, 200, encoding) - cache.set(cache_key, (headers, z_body, 200, encoding), 3600) + cache.set(cache_key, result, 3600) elif result == -1: # We cached an error, so normalize # it down to None result = None - else: - # Previous caches would be a 3-tuple instead of a 4-tuple, - # so this is being maintained for backwards compatibility - try: - encoding = result[3] - except 
IndexError: - encoding = None - result = (result[0], zlib.decompress(result[1]), result[2], encoding) return result @@ -313,7 +304,7 @@ def fetch_file(url, project=None, release=None, allow_scraping=True): else: result = None - cache_key = 'source:cache:v3:%s' % ( + cache_key = 'source:cache:v4:%s' % ( md5_text(url).hexdigest(), ) @@ -327,16 +318,6 @@ def fetch_file(url, project=None, release=None, allow_scraping=True): logger.debug('Checking cache for url %r', url) result = cache.get(cache_key) - if result is not None: - # Previous caches would be a 3-tuple instead of a 4-tuple, - # so this is being maintained for backwards compatibility - try: - encoding = result[3] - except IndexError: - encoding = None - # We got a cache hit, but the body is compressed, so we - # need to decompress it before handing it off - result = (result[0], zlib.decompress(result[1]), result[2], encoding) if result is None: # lock down domains that are problematic @@ -438,11 +419,10 @@ def fetch_file(url, project=None, release=None, allow_scraping=True): raise CannotFetchSource(error) body = b''.join(contents) - z_body = zlib.compress(body) headers = {k.lower(): v for k, v in response.headers.items()} encoding = response.encoding - cache.set(cache_key, (headers, z_body, response.status_code, encoding), 60) + cache.set(cache_key, (headers, body, response.status_code, encoding), 60) result = (headers, body, response.status_code, encoding) finally: if response is not None: diff --git a/src/sentry/utils/files.py b/src/sentry/utils/files.py index 1252e0002b124a..470cf988bf4037 100644 --- a/src/sentry/utils/files.py +++ b/src/sentry/utils/files.py @@ -7,14 +7,9 @@ """ from __future__ import absolute_import -import zlib - -def compress_file(fp, level=6): - compressor = zlib.compressobj(level) - z_chunks = [] +def compress_file(fp): chunks = [] for chunk in fp.chunks(): chunks.append(chunk) - z_chunks.append(compressor.compress(chunk)) - return (b''.join(z_chunks) + compressor.flush(), 
b''.join(chunks)) + return b''.join(chunks) From becab8097e9f3156ce64e30fac5af7e0f5bdb9c2 Mon Sep 17 00:00:00 2001 From: Matt Robenolt Date: Wed, 14 Dec 2016 16:20:48 -0800 Subject: [PATCH 2/3] javascript: handle artifacts uploaded as gzip or deflate --- src/sentry/lang/javascript/processor.py | 32 ++++- src/sentry/utils/files.py | 15 --- .../sentry/lang/javascript/test_processor.py | 121 ++++++++++++++++++ 3 files changed, 149 insertions(+), 19 deletions(-) delete mode 100644 src/sentry/utils/files.py diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 003929facd05a0..f60692ace7e3c1 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -17,6 +17,7 @@ from requests.utils import get_encoding_from_headers from six.moves.urllib.parse import urlparse, urljoin, urlsplit from libsourcemap import from_json as view_from_json +from urllib3.response import GzipDecoder, DeflateDecoder # In case SSL is unavailable (light builds) we can't import this here. try: @@ -31,7 +32,6 @@ class ZeroReturnError(Exception): from sentry.interfaces.stacktrace import Stacktrace from sentry.models import EventError, Release, ReleaseFile from sentry.utils.cache import cache -from sentry.utils.files import compress_file from sentry.utils.hashlib import md5_text from sentry.utils.http import is_valid_origin from sentry.utils.strings import truncatechars @@ -136,6 +136,17 @@ def trim_line(line, column=0): return line +# TODO(mattrobenolt): Generalize on this and leverage the urllib3 +# decoders inside coreapi as well so we have a unified method for +# handling gzip/deflate decompression. urllib3 is pretty good at this. 
+def get_content_decoder_from_headers(headers): + content_encoding = headers.get('content-encoding', '').lower() + if content_encoding == 'gzip': + return GzipDecoder() + if content_encoding == 'deflate': + return DeflateDecoder() + + def get_source_context(source, lineno, colno, context=LINES_OF_CONTEXT): if not source: return [], '', [] @@ -264,17 +275,30 @@ def fetch_release_file(filename, release): logger.debug('Found release artifact %r (id=%s, release_id=%s)', filename, releasefile.id, release.id) try: + body = [] with metrics.timer('sourcemaps.release_file_read'): with releasefile.file.getfile() as fp: - body = compress_file(fp) + for chunk in fp.chunks(): + body.append(chunk) + body = b''.join(body) except Exception as e: logger.exception(six.text_type(e)) cache.set(cache_key, -1, 3600) result = None else: headers = {k.lower(): v for k, v in releasefile.file.headers.items()} - encoding = get_encoding_from_headers(headers) - result = (headers, body, 200, encoding) + # Handle gzip/deflate compression depending on Content-Encoding header + decoder = get_content_decoder_from_headers(headers) + if decoder: + try: + body = decoder.decompress(body) + except Exception: + raise CannotFetchSource({ + 'type': EventError.JS_INVALID_SOURCE_ENCODING, + 'value': headers.get('content-encoding'), + 'url': expose_url(filename), + }) + result = (headers, body, 200, get_encoding_from_headers(headers)) cache.set(cache_key, result, 3600) elif result == -1: diff --git a/src/sentry/utils/files.py b/src/sentry/utils/files.py deleted file mode 100644 index 470cf988bf4037..00000000000000 --- a/src/sentry/utils/files.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -sentry.utils.files -~~~~~~~~~~~~~~~~~~ - -:copyright: (c) 2010-2014 by the Sentry Team, see AUTHORS for more details. -:license: BSD, see LICENSE for more details. 
-""" -from __future__ import absolute_import - - -def compress_file(fp): - chunks = [] - for chunk in fp.chunks(): - chunks.append(chunk) - return b''.join(chunks) diff --git a/tests/sentry/lang/javascript/test_processor.py b/tests/sentry/lang/javascript/test_processor.py index c5862e2ab4e02d..78d29100b80c70 100644 --- a/tests/sentry/lang/javascript/test_processor.py +++ b/tests/sentry/lang/javascript/test_processor.py @@ -5,6 +5,7 @@ import pytest import responses import six +import zlib from libsourcemap import Token from mock import patch @@ -72,6 +73,126 @@ def test_unicode(self): assert result == new_result + def test_deflate(self): + project = self.project + release = Release.objects.create( + project=project, + organization_id=project.organization_id, + version='abc', + ) + release.add_project(project) + + file = File.objects.create( + name='file.min.js', + type='release.file', + headers={ + 'Content-Type': 'application/json; charset=utf-8', + 'Content-Encoding': 'deflate' + }, + ) + + binary_body = unicode_body.encode('utf-8') + file.putfile(six.BytesIO(zlib.compress(binary_body))) + + ReleaseFile.objects.create( + name='file.min.js', + release=release, + project=project, + file=file, + ) + + result = fetch_release_file('file.min.js', release) + + assert type(result[1]) is six.binary_type + assert result == ( + {'content-type': 'application/json; charset=utf-8', 'content-encoding': 'deflate'}, + binary_body, + 200, + 'utf-8', + ) + + # test with cache hit, which should be compressed + new_result = fetch_release_file('file.min.js', release) + + assert result == new_result + + def test_gzip(self): + project = self.project + release = Release.objects.create( + project=project, + organization_id=project.organization_id, + version='abc', + ) + release.add_project(project) + + file = File.objects.create( + name='file.min.js', + type='release.file', + headers={ + 'Content-Type': 'application/json; charset=utf-8', + 'Content-Encoding': 'gzip' + }, + ) + + 
binary_body = unicode_body.encode('utf-8') + compressor = zlib.compressobj(6, zlib.DEFLATED, 16 + zlib.MAX_WBITS) + file.putfile(six.BytesIO(b''.join([ + compressor.compress(binary_body), + compressor.flush(), + ]))) + + ReleaseFile.objects.create( + name='file.min.js', + release=release, + project=project, + file=file, + ) + + result = fetch_release_file('file.min.js', release) + + assert type(result[1]) is six.binary_type + assert result == ( + {'content-type': 'application/json; charset=utf-8', 'content-encoding': 'gzip'}, + binary_body, + 200, + 'utf-8', + ) + + # test with cache hit, which should be compressed + new_result = fetch_release_file('file.min.js', release) + + assert result == new_result + + def test_garbage_encoding(self): + project = self.project + release = Release.objects.create( + project=project, + organization_id=project.organization_id, + version='abc', + ) + release.add_project(project) + + file = File.objects.create( + name='file.min.js', + type='release.file', + headers={ + 'Content-Type': 'application/json; charset=utf-8', + 'Content-Encoding': 'gzip' + }, + ) + + file.putfile(six.BytesIO('notgzipped')) + + ReleaseFile.objects.create( + name='file.min.js', + release=release, + project=project, + file=file, + ) + + with pytest.raises(CannotFetchSource): + fetch_release_file('file.min.js', release) + class FetchFileTest(TestCase): @responses.activate From ffb33a7b2f3ae5852b9c0fc63ffe97d2dbdb9b2b Mon Sep 17 00:00:00 2001 From: Matt Robenolt Date: Wed, 14 Dec 2016 16:31:41 -0800 Subject: [PATCH 3/3] Update CHANGES --- CHANGES | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES b/CHANGES index 6ac2d3f8d69cbc..f08c4242d33877 100644 --- a/CHANGES +++ b/CHANGES @@ -6,6 +6,7 @@ Version 8.12 (Unreleased) more accurately and will report system errors them to the internal logger. 
- Added data migration to backfill legacy release data +- Added data migration to backfill legacy commit data +- Allow gzipped/deflated JavaScript artifacts to be uploaded through the API. SDKs ~~~~