From da17c26198b154d8994096cb2cb59afa8ea3e326 Mon Sep 17 00:00:00 2001 From: Matt Robenolt Date: Wed, 14 Dec 2016 14:56:26 -0800 Subject: [PATCH 1/3] javascript: don't gzip files before caching This is unnecessary because python-memcached will gzip for us. By doing this ourselves, we're actually adding extra work by double gzipping. --- src/sentry/lang/javascript/processor.py | 30 +++++-------------------- src/sentry/utils/files.py | 9 ++------ 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 9f0abb24e6a7cb..003929facd05a0 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -8,7 +8,6 @@ import base64 import six import time -import zlib from django.conf import settings from django.core.exceptions import SuspiciousOperation @@ -218,7 +217,7 @@ def discover_sourcemap(result): def fetch_release_file(filename, release): - cache_key = 'releasefile:v1:%s:%s' % ( + cache_key = 'releasefile:v2:%s:%s' % ( release.id, md5_text(filename).hexdigest(), ) @@ -267,7 +266,7 @@ def fetch_release_file(filename, release): try: with metrics.timer('sourcemaps.release_file_read'): with releasefile.file.getfile() as fp: - z_body, body = compress_file(fp) + body = compress_file(fp) except Exception as e: logger.exception(six.text_type(e)) cache.set(cache_key, -1, 3600) @@ -276,20 +275,12 @@ def fetch_release_file(filename, release): headers = {k.lower(): v for k, v in releasefile.file.headers.items()} encoding = get_encoding_from_headers(headers) result = (headers, body, 200, encoding) - cache.set(cache_key, (headers, z_body, 200, encoding), 3600) + cache.set(cache_key, result, 3600) elif result == -1: # We cached an error, so normalize # it down to None result = None - else: - # Previous caches would be a 3-tuple instead of a 4-tuple, - # so this is being maintained for backwards compatibility - try: - encoding = result[3] - except 
IndexError: - encoding = None - result = (result[0], zlib.decompress(result[1]), result[2], encoding) return result @@ -313,7 +304,7 @@ def fetch_file(url, project=None, release=None, allow_scraping=True): else: result = None - cache_key = 'source:cache:v3:%s' % ( + cache_key = 'source:cache:v4:%s' % ( md5_text(url).hexdigest(), ) @@ -327,16 +318,6 @@ def fetch_file(url, project=None, release=None, allow_scraping=True): logger.debug('Checking cache for url %r', url) result = cache.get(cache_key) - if result is not None: - # Previous caches would be a 3-tuple instead of a 4-tuple, - # so this is being maintained for backwards compatibility - try: - encoding = result[3] - except IndexError: - encoding = None - # We got a cache hit, but the body is compressed, so we - # need to decompress it before handing it off - result = (result[0], zlib.decompress(result[1]), result[2], encoding) if result is None: # lock down domains that are problematic @@ -438,11 +419,10 @@ def fetch_file(url, project=None, release=None, allow_scraping=True): raise CannotFetchSource(error) body = b''.join(contents) - z_body = zlib.compress(body) headers = {k.lower(): v for k, v in response.headers.items()} encoding = response.encoding - cache.set(cache_key, (headers, z_body, response.status_code, encoding), 60) + cache.set(cache_key, (headers, body, response.status_code, encoding), 60) result = (headers, body, response.status_code, encoding) finally: if response is not None: diff --git a/src/sentry/utils/files.py b/src/sentry/utils/files.py index 1252e0002b124a..470cf988bf4037 100644 --- a/src/sentry/utils/files.py +++ b/src/sentry/utils/files.py @@ -7,14 +7,9 @@ """ from __future__ import absolute_import -import zlib - -def compress_file(fp, level=6): - compressor = zlib.compressobj(level) - z_chunks = [] +def compress_file(fp): chunks = [] for chunk in fp.chunks(): chunks.append(chunk) - z_chunks.append(compressor.compress(chunk)) - return (b''.join(z_chunks) + compressor.flush(), 
b''.join(chunks)) + return b''.join(chunks) From becab8097e9f3156ce64e30fac5af7e0f5bdb9c2 Mon Sep 17 00:00:00 2001 From: Matt Robenolt Date: Wed, 14 Dec 2016 16:20:48 -0800 Subject: [PATCH 2/3] javascript: handle artifacts uploaded as gzip or deflate --- src/sentry/lang/javascript/processor.py | 32 ++++- src/sentry/utils/files.py | 15 --- .../sentry/lang/javascript/test_processor.py | 121 ++++++++++++++++++ 3 files changed, 149 insertions(+), 19 deletions(-) delete mode 100644 src/sentry/utils/files.py diff --git a/src/sentry/lang/javascript/processor.py b/src/sentry/lang/javascript/processor.py index 003929facd05a0..f60692ace7e3c1 100644 --- a/src/sentry/lang/javascript/processor.py +++ b/src/sentry/lang/javascript/processor.py @@ -17,6 +17,7 @@ from requests.utils import get_encoding_from_headers from six.moves.urllib.parse import urlparse, urljoin, urlsplit from libsourcemap import from_json as view_from_json +from urllib3.response import GzipDecoder, DeflateDecoder # In case SSL is unavailable (light builds) we can't import this here. try: @@ -31,7 +32,6 @@ class ZeroReturnError(Exception): from sentry.interfaces.stacktrace import Stacktrace from sentry.models import EventError, Release, ReleaseFile from sentry.utils.cache import cache -from sentry.utils.files import compress_file from sentry.utils.hashlib import md5_text from sentry.utils.http import is_valid_origin from sentry.utils.strings import truncatechars @@ -136,6 +136,17 @@ def trim_line(line, column=0): return line +# TODO(mattrobenolt): Generalize on this and leverage the urllib3 +# decoders inside coreapi as well so we have a unified method for +# handling gzip/deflate decompression. urllib3 is pretty good at this. 
+def get_content_decoder_from_headers(headers): + content_encoding = headers.get('content-encoding', '').lower() + if content_encoding == 'gzip': + return GzipDecoder() + if content_encoding == 'deflate': + return DeflateDecoder() + + def get_source_context(source, lineno, colno, context=LINES_OF_CONTEXT): if not source: return [], '', [] @@ -264,17 +275,30 @@ def fetch_release_file(filename, release): logger.debug('Found release artifact %r (id=%s, release_id=%s)', filename, releasefile.id, release.id) try: + body = [] with metrics.timer('sourcemaps.release_file_read'): with releasefile.file.getfile() as fp: - body = compress_file(fp) + for chunk in fp.chunks(): + body.append(chunk) + body = b''.join(body) except Exception as e: logger.exception(six.text_type(e)) cache.set(cache_key, -1, 3600) result = None else: headers = {k.lower(): v for k, v in releasefile.file.headers.items()} - encoding = get_encoding_from_headers(headers) - result = (headers, body, 200, encoding) + # Handle gzip/deflate compression depending on Content-Encoding header + decoder = get_content_decoder_from_headers(headers) + if decoder: + try: + body = decoder.decompress(body) + except Exception: + raise CannotFetchSource({ + 'type': EventError.JS_INVALID_SOURCE_ENCODING, + 'value': headers.get('content-encoding'), + 'url': expose_url(filename), + }) + result = (headers, body, 200, get_encoding_from_headers(headers)) cache.set(cache_key, result, 3600) elif result == -1: diff --git a/src/sentry/utils/files.py b/src/sentry/utils/files.py deleted file mode 100644 index 470cf988bf4037..00000000000000 --- a/src/sentry/utils/files.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -sentry.utils.files -~~~~~~~~~~~~~~~~~~ - -:copyright: (c) 2010-2014 by the Sentry Team, see AUTHORS for more details. -:license: BSD, see LICENSE for more details. 
-""" -from __future__ import absolute_import - - -def compress_file(fp): - chunks = [] - for chunk in fp.chunks(): - chunks.append(chunk) - return b''.join(chunks) diff --git a/tests/sentry/lang/javascript/test_processor.py b/tests/sentry/lang/javascript/test_processor.py index c5862e2ab4e02d..78d29100b80c70 100644 --- a/tests/sentry/lang/javascript/test_processor.py +++ b/tests/sentry/lang/javascript/test_processor.py @@ -5,6 +5,7 @@ import pytest import responses import six +import zlib from libsourcemap import Token from mock import patch @@ -72,6 +73,126 @@ def test_unicode(self): assert result == new_result + def test_deflate(self): + project = self.project + release = Release.objects.create( + project=project, + organization_id=project.organization_id, + version='abc', + ) + release.add_project(project) + + file = File.objects.create( + name='file.min.js', + type='release.file', + headers={ + 'Content-Type': 'application/json; charset=utf-8', + 'Content-Encoding': 'deflate' + }, + ) + + binary_body = unicode_body.encode('utf-8') + file.putfile(six.BytesIO(zlib.compress(binary_body))) + + ReleaseFile.objects.create( + name='file.min.js', + release=release, + project=project, + file=file, + ) + + result = fetch_release_file('file.min.js', release) + + assert type(result[1]) is six.binary_type + assert result == ( + {'content-type': 'application/json; charset=utf-8', 'content-encoding': 'deflate'}, + binary_body, + 200, + 'utf-8', + ) + + # test with cache hit, which should be compressed + new_result = fetch_release_file('file.min.js', release) + + assert result == new_result + + def test_gzip(self): + project = self.project + release = Release.objects.create( + project=project, + organization_id=project.organization_id, + version='abc', + ) + release.add_project(project) + + file = File.objects.create( + name='file.min.js', + type='release.file', + headers={ + 'Content-Type': 'application/json; charset=utf-8', + 'Content-Encoding': 'gzip' + }, + ) + + 
binary_body = unicode_body.encode('utf-8') + compressor = zlib.compressobj(6, zlib.DEFLATED, 16 + zlib.MAX_WBITS) + file.putfile(six.BytesIO(b''.join([ + compressor.compress(binary_body), + compressor.flush(), + ]))) + + ReleaseFile.objects.create( + name='file.min.js', + release=release, + project=project, + file=file, + ) + + result = fetch_release_file('file.min.js', release) + + assert type(result[1]) is six.binary_type + assert result == ( + {'content-type': 'application/json; charset=utf-8', 'content-encoding': 'gzip'}, + binary_body, + 200, + 'utf-8', + ) + + # test with cache hit, which should be compressed + new_result = fetch_release_file('file.min.js', release) + + assert result == new_result + + def test_garbage_encoding(self): + project = self.project + release = Release.objects.create( + project=project, + organization_id=project.organization_id, + version='abc', + ) + release.add_project(project) + + file = File.objects.create( + name='file.min.js', + type='release.file', + headers={ + 'Content-Type': 'application/json; charset=utf-8', + 'Content-Encoding': 'gzip' + }, + ) + + file.putfile(six.BytesIO('notgzipped')) + + ReleaseFile.objects.create( + name='file.min.js', + release=release, + project=project, + file=file, + ) + + with pytest.raises(CannotFetchSource): + fetch_release_file('file.min.js', release) + class FetchFileTest(TestCase): @responses.activate From ffb33a7b2f3ae5852b9c0fc63ffe97d2dbdb9b2b Mon Sep 17 00:00:00 2001 From: Matt Robenolt Date: Wed, 14 Dec 2016 16:31:41 -0800 Subject: [PATCH 3/3] Update CHANGES --- CHANGES | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES b/CHANGES index 6ac2d3f8d69cbc..f08c4242d33877 100644 --- a/CHANGES +++ b/CHANGES @@ -6,6 +6,7 @@ Version 8.12 (Unreleased) more accurately and will report system errors them to the internal logger. 
- Added data migration to backfill legacy release data +- Added data migration to backfill legacy commit data +- Allow gzipped/deflated JavaScript artifacts to be uploaded through the API. SDKs ~~~~