From 3895a7971118e51cba5cfc08862ecba8558b711c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 9 Nov 2017 18:45:22 +0100 Subject: [PATCH 01/39] Do no allocate large temp buffers in _Pickler.dump For all protocols: avoid concatenating large bytes and str with their opcode and size header but instead issue an individual call to self.write(data). For protocol 4: if the size of the opcode + size header + data is larger than the target frame size, commit the current frame and bypass io.BytesIO to write the next frame directly to the underlying file object. --- Lib/pickle.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index faa8fd7e557f9c..42d05f98f2f489 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -216,6 +216,21 @@ def write(self, data): else: return self.file_write(data) + def write_many(self, *chunks): + total_size = sum(len(c) for c in chunks) + if self.current_frame and total_size >= self._FRAME_SIZE_TARGET: + # Terminate the current frame to write the next frame directly into + # the underlying file to skip the unnecessary memory allocations + # of a large temporary buffer. 
+ self.commit_frame(force=True) + write = self.file_write + write(FRAME + pack("= 1 @@ -694,11 +710,11 @@ def save_bytes(self, obj): return n = len(obj) if n <= 0xff: - self.write(SHORT_BINBYTES + pack(" 0xffffffff and self.proto >= 4: - self.write(BINBYTES8 + pack("= 4: - self.write(SHORT_BINUNICODE + pack(" 0xffffffff and self.proto >= 4: - self.write(BINUNICODE8 + pack(" Date: Thu, 9 Nov 2017 19:30:58 +0100 Subject: [PATCH 02/39] Add comment to justify iteration over chunks --- Lib/pickle.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Lib/pickle.py b/Lib/pickle.py index 42d05f98f2f489..2eee71088e0a57 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -228,6 +228,9 @@ def write_many(self, *chunks): else: write = self.write + # Be careful to not concatenate the chunks prior to calling 'write' as + # some chunks (typically the last of the list) can be very large and we + # do not want to allocate a large temporary bytes object. for chunk in chunks: write(chunk) From 964a67f437f23a9f0b4df9aec4b931232ea218b7 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 9 Nov 2017 19:37:01 +0100 Subject: [PATCH 03/39] Concatenate short bytes --- Lib/pickle.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 2eee71088e0a57..0fe5eda52761f5 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -713,11 +713,11 @@ def save_bytes(self, obj): return n = len(obj) if n <= 0xff: - self._write_many(SHORT_BINBYTES, pack(" 0xffffffff and self.proto >= 4: - self._write_many(BINBYTES8, pack("= 4: - self._write_many(SHORT_BINUNICODE, pack(" 0xffffffff and self.proto >= 4: - self._write_many(BINUNICODE8, pack(" Date: Fri, 10 Nov 2017 00:05:24 +0100 Subject: [PATCH 04/39] Add entry to NEWS --- .../next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst diff --git 
a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst new file mode 100644 index 00000000000000..56d971e07b3230 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst @@ -0,0 +1,3 @@ +The Python-based ``pickle._Pickler`` does no longer allocate temporary +memory when dumping large ``bytes`` objects. Instead the data is directly +streamed into the underlying file object. From 410d73a92cff29f954bc063c8d75c0943abacb45 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 10 Nov 2017 14:51:00 +0100 Subject: [PATCH 05/39] Reduce overhead for small bytes objects --- Lib/pickle.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 0fe5eda52761f5..3fb47a9fe938bb 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -216,23 +216,22 @@ def write(self, data): else: return self.file_write(data) - def write_many(self, *chunks): - total_size = sum(len(c) for c in chunks) - if self.current_frame and total_size >= self._FRAME_SIZE_TARGET: + def write_large(self, header, payload): + if len(payload) >= self._FRAME_SIZE_TARGET and self.current_frame: # Terminate the current frame to write the next frame directly into # the underlying file to skip the unnecessary memory allocations # of a large temporary buffer. 
self.commit_frame(force=True) write = self.file_write - write(FRAME + pack("= 1 @@ -715,9 +714,9 @@ def save_bytes(self, obj): if n <= 0xff: self.write(SHORT_BINBYTES + pack(" 0xffffffff and self.proto >= 4: - self._write_many(BINBYTES8 + pack("= 4: self.write(SHORT_BINUNICODE + pack(" 0xffffffff and self.proto >= 4: - self._write_many(BINUNICODE8 + pack(" Date: Fri, 10 Nov 2017 15:17:51 +0100 Subject: [PATCH 06/39] Even more overhead reduction for small bytes buffers --- Lib/pickle.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 3fb47a9fe938bb..ff59c6d2e0653e 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -216,22 +216,23 @@ def write(self, data): else: return self.file_write(data) - def write_large(self, header, payload): + def write_large_bytes(self, opcode, size_header, payload): if len(payload) >= self._FRAME_SIZE_TARGET and self.current_frame: # Terminate the current frame to write the next frame directly into # the underlying file to skip the unnecessary memory allocations # of a large temporary buffer. 
self.commit_frame(force=True) write = self.file_write - write(FRAME + pack("= 1 @@ -714,9 +715,9 @@ def save_bytes(self, obj): if n <= 0xff: self.write(SHORT_BINBYTES + pack(" 0xffffffff and self.proto >= 4: - self._write_large(BINBYTES8 + pack("= 4: self.write(SHORT_BINUNICODE + pack(" 0xffffffff and self.proto >= 4: - self._write_large(BINUNICODE8 + pack(" Date: Fri, 10 Nov 2017 15:23:36 +0100 Subject: [PATCH 07/39] Fix no-copy for protocol < 4 --- Lib/pickle.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index ff59c6d2e0653e..5b40fe12efcfae 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -217,17 +217,18 @@ def write(self, data): return self.file_write(data) def write_large_bytes(self, opcode, size_header, payload): - if len(payload) >= self._FRAME_SIZE_TARGET and self.current_frame: - # Terminate the current frame to write the next frame directly into - # the underlying file to skip the unnecessary memory allocations - # of a large temporary buffer. - self.commit_frame(force=True) + if len(payload) >= self._FRAME_SIZE_TARGET: write = self.file_write - frame_size = len(opcode) + len(size_header) + len(payload) - write(FRAME + pack(" Date: Fri, 10 Nov 2017 15:50:09 +0100 Subject: [PATCH 08/39] More overhead reduction for small bytes --- Lib/pickle.py | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 5b40fe12efcfae..d188d83bbc2db8 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -216,24 +216,21 @@ def write(self, data): else: return self.file_write(data) - def write_large_bytes(self, opcode, size_header, payload): - if len(payload) >= self._FRAME_SIZE_TARGET: - write = self.file_write - if self.current_frame: - # Terminate the current frame to write the next frame directly - # into the underlying file to skip the unnecessary memory - # allocations of a large temporary buffer. 
- self.commit_frame(force=True) - frame_size = len(opcode) + len(size_header) + len(payload) - write(FRAME + pack(" 0xffffffff and self.proto >= 4: - self._write_large_bytes(BINBYTES8, pack(" self.framer._FRAME_SIZE_TARGET: + self._write_large_bytes(BINBYTES + pack("= 4: self.write(SHORT_BINUNICODE + pack(" 0xffffffff and self.proto >= 4: - self._write_large_bytes(BINUNICODE8, pack(" self.framer._FRAME_SIZE_TARGET: + self._write_large_bytes(BINUNICODE + pack(" Date: Sat, 11 Nov 2017 00:32:09 +0100 Subject: [PATCH 09/39] Direct write for large bytes in the C-pickler --- Modules/_pickle.c | 61 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 4b7f1ed66b30e9..50e9591c477f5c 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -890,7 +890,7 @@ _Pickler_WriteFrameHeader(PicklerObject *self, char *qdata, size_t frame_len) } static int -_Pickler_CommitFrame(PicklerObject *self) +_Pickler_CommitFrame(PicklerObject *self, size_t frame_extension) { size_t frame_len; char *qdata; @@ -898,6 +898,7 @@ _Pickler_CommitFrame(PicklerObject *self) if (!self->framing || self->frame_start == -1) return 0; frame_len = self->output_len - self->frame_start - FRAME_HEADER_SIZE; + frame_len += frame_extension; qdata = PyBytes_AS_STRING(self->output_buffer) + self->frame_start; _Pickler_WriteFrameHeader(self, qdata, frame_len); self->frame_start = -1; @@ -913,7 +914,7 @@ _Pickler_OpcodeBoundary(PicklerObject *self) return 0; frame_len = self->output_len - self->frame_start - FRAME_HEADER_SIZE; if (frame_len >= FRAME_SIZE_TARGET) - return _Pickler_CommitFrame(self); + return _Pickler_CommitFrame(self, 0); else return 0; } @@ -925,7 +926,7 @@ _Pickler_GetString(PicklerObject *self) assert(self->output_buffer != NULL); - if (_Pickler_CommitFrame(self)) + if (_Pickler_CommitFrame(self, 0)) return NULL; self->output_buffer = NULL; @@ -2057,6 +2058,43 @@ save_float(PicklerObject *self, 
PyObject *obj) return 0; } +static int +_Pickler_write_large_bytes( + PicklerObject *self, const char *header, Py_ssize_t header_size, + PyObject *payload, Py_ssize_t payload_size) +{ + assert(self->output_buffer != NULL); + assert(self->write != NULL); + PyObject *result; + + // Commit the previous frame + if (_Pickler_CommitFrame(self, 0)) + return -1; + + // Create a new frame dedicated to the large bytes + if (_Pickler_Write(self, header, header_size) < 0) + return -1; + if (_Pickler_CommitFrame(self, payload_size)) + return -1; + + // Dump the buffer to the file + if (_Pickler_FlushToFile(self) < 0) + return -1; + + // Reinitialize the buffer for subsequent calls to _Pickler_Write. + if (_Pickler_ClearBuffer(self) < 0) + return -1; + + // Stream write the payload into the file without going through the + // output buffer + result = PyObject_CallFunctionObjArgs(self->write, payload, NULL); + Py_XDECREF(result); + if (result == NULL) + return -1; + + return 0; +} + static int save_bytes(PicklerObject *self, PyObject *obj) { @@ -2135,12 +2173,19 @@ save_bytes(PicklerObject *self, PyObject *obj) return -1; /* string too large */ } - if (_Pickler_Write(self, header, len) < 0) - return -1; - - if (_Pickler_Write(self, PyBytes_AS_STRING(obj), size) < 0) - return -1; + if (size < FRAME_SIZE_TARGET || self->write == NULL) { + if (_Pickler_Write(self, header, len) < 0) + return -1; + if (_Pickler_Write(self, PyBytes_AS_STRING(obj), size) < 0) + return -1; + } + else { + // Bypass the in-memory buffer to directly stream large data + // into the underlying file object + if (_Pickler_write_large_bytes(self, header, len, obj, size) < 0) + return -1; + } if (memo_put(self, obj) < 0) return -1; From e795e66879998abfce523b4859467245d331dd76 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 11 Nov 2017 16:29:33 +0100 Subject: [PATCH 10/39] Update NEWS to include the C pickler --- .../next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst index 56d971e07b3230..9cd05b0ad43865 100644 --- a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst +++ b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst @@ -1,3 +1,3 @@ -The Python-based ``pickle._Pickler`` does no longer allocate temporary -memory when dumping large ``bytes`` objects. Instead the data is directly -streamed into the underlying file object. +The C-based and Python-based picklers do no longer allocate temporary memory +when dumping large ``bytes`` objects into a file object. Instead the data is +directly streamed into the underlying file object. From bb4d3eb71f987f75aae9c383bcb670a23772fd0a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sun, 12 Nov 2017 12:18:54 +0100 Subject: [PATCH 11/39] No-copy dump for large unicode in C pickler --- Modules/_pickle.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 50e9591c477f5c..7335079d6ae213 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -2269,6 +2269,7 @@ write_utf8(PicklerObject *self, const char *data, Py_ssize_t size) { char header[9]; Py_ssize_t len; + PyObject *mem; assert(size >= 0); if (size <= 0xff && self->proto >= 4) { @@ -2295,11 +2296,22 @@ write_utf8(PicklerObject *self, const char *data, Py_ssize_t size) return -1; } - if (_Pickler_Write(self, header, len) < 0) - return -1; - if (_Pickler_Write(self, data, size) < 0) - return -1; - + if (size < FRAME_SIZE_TARGET || self->write == NULL) { + if (_Pickler_Write(self, header, len) < 0) + return -1; + if (_Pickler_Write(self, data, size) < 0) + return -1; + } + else { + // Bypass the in-memory buffer to directly stream large data + // into the underlying file object + mem = PyMemoryView_FromMemory((char *) data, size, PyBUF_READ); + if (mem 
== NULL) + return -1; + if (_Pickler_write_large_bytes(self, header, len, mem, size) < 0) + return -1; + Py_DECREF(mem); + } return 0; } From 5d52bd735f2bc221826043d8c45a91a5a149ea18 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sun, 12 Nov 2017 12:22:13 +0100 Subject: [PATCH 12/39] Update NEWS to mention str [ci skip] --- .../next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst index 9cd05b0ad43865..27a9de01226ab9 100644 --- a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst +++ b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst @@ -1,3 +1,3 @@ The C-based and Python-based picklers do no longer allocate temporary memory -when dumping large ``bytes`` objects into a file object. Instead the data is -directly streamed into the underlying file object. +when dumping large ``bytes`` and ``str`` objects into a file object. Instead +the data is directly streamed into the underlying file object. From 054c94b0fb8aee800fbac48afbef2e8d51f439c8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sun, 12 Nov 2017 13:35:22 +0100 Subject: [PATCH 13/39] Simpler NEWS entry [ci skip] --- .../next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst index 27a9de01226ab9..2f7108a6b5d7ea 100644 --- a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst +++ b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst @@ -1,3 +1,3 @@ -The C-based and Python-based picklers do no longer allocate temporary memory -when dumping large ``bytes`` and ``str`` objects into a file object. 
Instead -the data is directly streamed into the underlying file object. +The picklers do no longer allocate temporary memory when dumping large +``bytes`` and ``str`` objects into a file object. Instead the data is +directly streamed into the underlying file object. From 31c3afaf054c066fc14b49417b0db2df17333dbf Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sun, 12 Nov 2017 13:34:52 +0100 Subject: [PATCH 14/39] C-code style fixes --- Modules/_pickle.c | 64 +++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 7335079d6ae213..ebfc846a4cde91 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -2058,6 +2058,8 @@ save_float(PicklerObject *self, PyObject *obj) return 0; } +/* No-copy code-path to write large contiguous data directly into the + underlying file object, bypassing the output_buffer of the Pickler. */ static int _Pickler_write_large_bytes( PicklerObject *self, const char *header, Py_ssize_t header_size, @@ -2067,30 +2069,35 @@ _Pickler_write_large_bytes( assert(self->write != NULL); PyObject *result; - // Commit the previous frame - if (_Pickler_CommitFrame(self, 0)) + /* Commit the previous frame. */ + if (_Pickler_CommitFrame(self, 0)) { return -1; + } - // Create a new frame dedicated to the large bytes - if (_Pickler_Write(self, header, header_size) < 0) + /* Create a new frame dedicated to the large bytes. */ + if (_Pickler_Write(self, header, header_size) < 0) { return -1; - if (_Pickler_CommitFrame(self, payload_size)) + } + if (_Pickler_CommitFrame(self, payload_size)) { return -1; - - // Dump the buffer to the file - if (_Pickler_FlushToFile(self) < 0) + } + /* Dump the buffer to the file. */ + if (_Pickler_FlushToFile(self) < 0) { return -1; + } - // Reinitialize the buffer for subsequent calls to _Pickler_Write. - if (_Pickler_ClearBuffer(self) < 0) + /* Reinitialize the buffer for subsequent calls to _Pickler_Write. 
*/ + if (_Pickler_ClearBuffer(self) < 0) { return -1; + } - // Stream write the payload into the file without going through the - // output buffer + /* Stream write the payload into the file without going through the + output buffer. */ result = PyObject_CallFunctionObjArgs(self->write, payload, NULL); Py_XDECREF(result); - if (result == NULL) + if (result == NULL) { return -1; + } return 0; } @@ -2174,18 +2181,21 @@ save_bytes(PicklerObject *self, PyObject *obj) } if (size < FRAME_SIZE_TARGET || self->write == NULL) { - if (_Pickler_Write(self, header, len) < 0) + if (_Pickler_Write(self, header, len) < 0) { return -1; - - if (_Pickler_Write(self, PyBytes_AS_STRING(obj), size) < 0) + } + if (_Pickler_Write(self, PyBytes_AS_STRING(obj), size) < 0) { return -1; + } } else { - // Bypass the in-memory buffer to directly stream large data - // into the underlying file object - if (_Pickler_write_large_bytes(self, header, len, obj, size) < 0) + /* Bypass the in-memory buffer to directly stream large data + into the underlying file object. */ + if (_Pickler_write_large_bytes(self, header, len, obj, size) < 0) { return -1; + } } + if (memo_put(self, obj) < 0) return -1; @@ -2297,19 +2307,23 @@ write_utf8(PicklerObject *self, const char *data, Py_ssize_t size) } if (size < FRAME_SIZE_TARGET || self->write == NULL) { - if (_Pickler_Write(self, header, len) < 0) + if (_Pickler_Write(self, header, len) < 0) { return -1; - if (_Pickler_Write(self, data, size) < 0) + } + if (_Pickler_Write(self, data, size) < 0) { return -1; + } } else { - // Bypass the in-memory buffer to directly stream large data - // into the underlying file object + /* Bypass the in-memory buffer to directly stream large data + into the underlying file object. 
*/ mem = PyMemoryView_FromMemory((char *) data, size, PyBUF_READ); - if (mem == NULL) + if (mem == NULL) { return -1; - if (_Pickler_write_large_bytes(self, header, len, mem, size) < 0) + } + if (_Pickler_write_large_bytes(self, header, len, mem, size) < 0) { return -1; + } Py_DECREF(mem); } return 0; From dcce63a276a1afd2bc5b4610af34d0825a50546a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sun, 12 Nov 2017 17:06:31 +0100 Subject: [PATCH 15/39] Fixed leak --- Modules/_pickle.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Modules/_pickle.c b/Modules/_pickle.c index ebfc846a4cde91..e34778d9136930 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -2322,6 +2322,7 @@ write_utf8(PicklerObject *self, const char *data, Py_ssize_t size) return -1; } if (_Pickler_write_large_bytes(self, header, len, mem, size) < 0) { + Py_DECREF(mem); return -1; } Py_DECREF(mem); From d233208785ab52834aa696efd42925f5d73a8bf8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sun, 12 Nov 2017 19:45:38 +0100 Subject: [PATCH 16/39] Do not wrap large objects in frames --- Lib/pickle.py | 19 +++++++------- Lib/test/pickletester.py | 55 ++++++++++++++++++++++++++++++---------- Modules/_pickle.c | 21 +++++++-------- 3 files changed, 62 insertions(+), 33 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index d188d83bbc2db8..1608273ea66f99 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -219,16 +219,15 @@ def write(self, data): def write_large_bytes(self, header, payload): write = self.file_write if self.current_frame: - # Terminate the current frame to write the next frame directly - # into the underlying file to skip the unnecessary memory - # allocations of a large temporary buffer. + # Terminate the current frame and flush it to the file. 
self.commit_frame(force=True) - frame_size = len(header) + len(payload) - write(FRAME + pack(" 0xffffffff and self.proto >= 4: self._write_large_bytes(BINBYTES8 + pack(" self.framer._FRAME_SIZE_TARGET: + elif n >= self.framer._FRAME_SIZE_TARGET: self._write_large_bytes(BINBYTES + pack(" 0xffffffff and self.proto >= 4: self._write_large_bytes(BINUNICODE8 + pack(" self.framer._FRAME_SIZE_TARGET: + elif n >= self.framer._FRAME_SIZE_TARGET: self._write_large_bytes(BINUNICODE + pack(" self.FRAME_SIZE_TARGET: + frame_opcode_size = frameless_opcode_sizes[op.name] + arg = len(arg) + else: + continue + elif op.name == 'FRAME': + frame_opcode_size = 9 + else: continue + if last_pos is not None: # The previous frame's size should be equal to the number # of bytes up to the current frame. - frame_size = pos - last_pos - frame_opcode_size + frame_size = pos - last_pos - last_frame_opcode_size self.assertEqual(frame_size, last_arg) last_arg, last_pos = arg, pos + last_frame_opcode_size = frame_opcode_size # The last frame's size should be equal to the number of bytes up # to the pickle's end. 
- frame_size = len(pickled) - last_pos - frame_opcode_size + frame_size = len(pickled) - last_pos - last_frame_opcode_size self.assertEqual(frame_size, last_arg) def test_framing_many_objects(self): @@ -2076,15 +2095,25 @@ def test_framing_many_objects(self): def test_framing_large_objects(self): N = 1024 * 1024 - obj = [b'x' * N, b'y' * N, b'z' * N] + obj = [b'x' * N, b'y' * N, 'z' * N] for proto in range(4, pickle.HIGHEST_PROTOCOL + 1): - with self.subTest(proto=proto): - pickled = self.dumps(obj, proto) - unpickled = self.loads(pickled) - self.assertEqual(obj, unpickled) - n_frames = count_opcode(pickle.FRAME, pickled) - self.assertGreaterEqual(n_frames, len(obj)) - self.check_frame_opcodes(pickled) + for fast in [True, False]: + with self.subTest(proto=proto, fast=fast): + buf = io.BytesIO() + pickler = self.pickler(buf, protocol=proto) + pickler.fast = fast + pickler.dump(obj) + pickled = buf.getvalue() + unpickled = self.loads(pickled) + self.assertEqual(obj, unpickled) + n_frames = count_opcode(pickle.FRAME, pickled) + if not fast: + # One frame per memoize for each large object. + self.assertGreaterEqual(n_frames, len(obj)) + else: + # One frame at the beginning and one at the end. + self.assertGreaterEqual(n_frames, 2) + self.check_frame_opcodes(pickled) def test_optional_frames(self): if pickle.HIGHEST_PROTOCOL < 4: diff --git a/Modules/_pickle.c b/Modules/_pickle.c index e34778d9136930..05894ab884f1b8 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -2073,24 +2073,17 @@ _Pickler_write_large_bytes( if (_Pickler_CommitFrame(self, 0)) { return -1; } + /* Disable frameing temporarily */ + self->framing = 0; - /* Create a new frame dedicated to the large bytes. */ if (_Pickler_Write(self, header, header_size) < 0) { return -1; } - if (_Pickler_CommitFrame(self, payload_size)) { - return -1; - } - /* Dump the buffer to the file. */ + /* Dump the output buffer to the file. 
*/ if (_Pickler_FlushToFile(self) < 0) { return -1; } - /* Reinitialize the buffer for subsequent calls to _Pickler_Write. */ - if (_Pickler_ClearBuffer(self) < 0) { - return -1; - } - /* Stream write the payload into the file without going through the output buffer. */ result = PyObject_CallFunctionObjArgs(self->write, payload, NULL); @@ -2099,6 +2092,14 @@ _Pickler_write_large_bytes( return -1; } + /* Reinitialize the buffer for subsequent calls to _Pickler_Write. */ + if (_Pickler_ClearBuffer(self) < 0) { + return -1; + } + + /* Re-enable framing for subsequent calls to _Pickler_Write. */ + self->framing = 1; + return 0; } From dfe6314ae594bbbf2a13a511b28f12b8751fcef5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sun, 12 Nov 2017 19:49:59 +0100 Subject: [PATCH 17/39] Fix bug in function call result handling --- Modules/_pickle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 05894ab884f1b8..717c3820822af3 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -2087,10 +2087,10 @@ _Pickler_write_large_bytes( /* Stream write the payload into the file without going through the output buffer. */ result = PyObject_CallFunctionObjArgs(self->write, payload, NULL); - Py_XDECREF(result); if (result == NULL) { return -1; } + Py_DECREF(result); /* Reinitialize the buffer for subsequent calls to _Pickler_Write. 
*/ if (_Pickler_ClearBuffer(self) < 0) { From 61611b28220fed436d98446bf6451e45bfca1a21 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sun, 12 Nov 2017 22:03:47 +0100 Subject: [PATCH 18/39] Fix frameless blobs test for pickletools.optimize --- Lib/test/pickletester.py | 34 ++++++++++++++++++++++++---------- Lib/test/test_pickletools.py | 4 ++++ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 92bec44f34e53b..d2184007e2c2cd 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -2039,13 +2039,13 @@ def test_setitems_on_non_dicts(self): FRAME_SIZE_TARGET = 64 * 1024 - def check_frame_opcodes(self, pickled): + def check_frame_opcodes(self, pickled, frameless_blobs=True): """ Check the arguments of FRAME opcodes in a protocol 4+ pickle. Note that binary objects that are larger than FRAME_SIZE_TARGET are not - framed and are therefore considered a frame by themselves in the - following consistency check. + framed by default and are therefore considered a frame by themselves in + the following consistency check. """ last_arg = last_pos = last_frame_opcode_size = None frameless_opcode_sizes = { @@ -2055,7 +2055,7 @@ def check_frame_opcodes(self, pickled): 'BINUNICODE8': 9, } for op, arg, pos in pickletools.genops(pickled): - if op.name in frameless_opcode_sizes: + if frameless_blobs and op.name in frameless_opcode_sizes: if len(arg) > self.FRAME_SIZE_TARGET: frame_opcode_size = frameless_opcode_sizes[op.name] arg = len(arg) @@ -2098,12 +2098,26 @@ def test_framing_large_objects(self): obj = [b'x' * N, b'y' * N, 'z' * N] for proto in range(4, pickle.HIGHEST_PROTOCOL + 1): for fast in [True, False]: + if fast and not hasattr(self, 'pickler'): + continue + + # The default picklers do not include large binary objects in + # frames. + # However some alternative implementation of dump / dumps can + # output pickles that include large binary objects inside + # protocol 4 frames. 
The test classes of such implementations + # make it explicit by setting the following flag. + frameless_blobs = getattr(self, 'frameless_blobs', True) + with self.subTest(proto=proto, fast=fast): - buf = io.BytesIO() - pickler = self.pickler(buf, protocol=proto) - pickler.fast = fast - pickler.dump(obj) - pickled = buf.getvalue() + if hasattr(self, 'pickler'): + buf = io.BytesIO() + pickler = self.pickler(buf, protocol=proto) + pickler.fast = fast + pickler.dump(obj) + pickled = buf.getvalue() + else: + pickled = self.dumps(obj, proto) unpickled = self.loads(pickled) self.assertEqual(obj, unpickled) n_frames = count_opcode(pickle.FRAME, pickled) @@ -2113,7 +2127,7 @@ def test_framing_large_objects(self): else: # One frame at the beginning and one at the end. self.assertGreaterEqual(n_frames, 2) - self.check_frame_opcodes(pickled) + self.check_frame_opcodes(pickled, frameless_blobs) def test_optional_frames(self): if pickle.HIGHEST_PROTOCOL < 4: diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py index 86bebfa0266862..890aaf31a44311 100644 --- a/Lib/test/test_pickletools.py +++ b/Lib/test/test_pickletools.py @@ -7,6 +7,10 @@ class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests): + # pickletools.optimize only works in-memory on pickle strings: it + # therefore is ok to include large objects inside a large frame. 
+ frameless_blobs = False + def dumps(self, arg, proto=None): return pickletools.optimize(pickle.dumps(arg, proto)) From e67df814eb18a003a81e11000c0173db1c557076 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 13 Nov 2017 00:03:47 +0100 Subject: [PATCH 19/39] Flush to file after each frame commit --- Lib/pickle.py | 24 +++++++++++++++++------ Lib/test/pickletester.py | 31 ++++++++++++++++++++++++++++++ Modules/_pickle.c | 41 ++++++++++++++++++++++++++-------------- 3 files changed, 76 insertions(+), 20 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 1608273ea66f99..2cda615b328150 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -188,6 +188,7 @@ class _Framer: def __init__(self, file_write): self.file_write = file_write self.current_frame = None + self.delayed_proto_opcode = None def start_framing(self): self.current_frame = io.BytesIO() @@ -203,13 +204,24 @@ def commit_frame(self, force=False): if f.tell() >= self._FRAME_SIZE_TARGET or force: with f.getbuffer() as data: n = len(data) - write = self.file_write - write(FRAME) - write(pack("= 2: - self.write(PROTO + pack("= 4: self.framer.start_framing() + if self.proto >= 2: + self.framer.write_proto(PROTO + pack("framing || self->frame_start == -1) - return 0; - frame_len = self->output_len - self->frame_start - FRAME_HEADER_SIZE; - if (frame_len >= FRAME_SIZE_TARGET) - return _Pickler_CommitFrame(self, 0); - else - return 0; -} - static PyObject * _Pickler_GetString(PicklerObject *self) { @@ -953,6 +939,33 @@ _Pickler_FlushToFile(PicklerObject *self) return (result == NULL) ? 
-1 : 0; } +static int +_Pickler_OpcodeBoundary(PicklerObject *self) +{ + Py_ssize_t frame_len; + + if (!self->framing || self->frame_start == -1) + return 0; + frame_len = self->output_len - self->frame_start - FRAME_HEADER_SIZE; + if (frame_len >= FRAME_SIZE_TARGET) { + if(_Pickler_CommitFrame(self, 0)) { + return -1; + } + if (self->write != NULL) { + if (_Pickler_FlushToFile(self) < 0) { + return -1; + } + if (_Pickler_ClearBuffer(self) < 0) { + return -1; + } + } + return 0; + } + else { + return 0; + } +} + static Py_ssize_t _Pickler_Write(PicklerObject *self, const char *s, Py_ssize_t data_len) { From d1e9a7d30fb5f5f7c9368b533d0fa6ee10acbee2 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 13 Nov 2017 00:33:41 +0100 Subject: [PATCH 20/39] Extend NEWS entry to mention write-on-frame-commit --- .../next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst index 2f7108a6b5d7ea..595826ae5f54a5 100644 --- a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst +++ b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst @@ -1,3 +1,7 @@ The picklers do no longer allocate temporary memory when dumping large ``bytes`` and ``str`` objects into a file object. Instead the data is directly streamed into the underlying file object. + +Previously the C implementation would buffer all content and issue a +single call to ``file.write`` at the end of the dump. With protocol 4 +this behavior has changed to issue one call to ``file.write`` per frame. 
From 7f08831e3f70adea54e4b231ad8024c834e6d543 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 17 Nov 2017 13:31:07 +0100 Subject: [PATCH 21/39] Make proto 4 Python pickler issue (2 * n_frames + 1) calls to write --- Lib/pickle.py | 23 ++++------------------- Lib/test/pickletester.py | 39 +++++++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 2cda615b328150..30aab33da0e10c 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -188,7 +188,6 @@ class _Framer: def __init__(self, file_write): self.file_write = file_write self.current_frame = None - self.delayed_proto_opcode = None def start_framing(self): self.current_frame = io.BytesIO() @@ -203,25 +202,11 @@ def commit_frame(self, force=False): f = self.current_frame if f.tell() >= self._FRAME_SIZE_TARGET or force: with f.getbuffer() as data: - n = len(data) - po = self.delayed_proto_opcode - if po is None: - self.file_write(FRAME + pack("= 2: + self.write(PROTO + pack("= 4: self.framer.start_framing() - if self.proto >= 2: - self.framer.write_proto(PROTO + pack(" 0: + n_frames += 1 + + # There should be at least one call to write per frame + self.assertGreaterEqual(len(w.chunks), n_frames) + + # but not too many either: there can be one for the proto, + # one per-frame header and one per frame for the actual contents. 
+ self.assertGreaterEqual(2 * n_frames + 1, len(w.chunks)) + + chunk_sizes = [len(c) for c in w.chunks[:-1]] + large_sizes = [s for s in chunk_sizes + if s >= self.FRAME_SIZE_TARGET] + small_sizes = [s for s in chunk_sizes + if s < self.FRAME_SIZE_TARGET] + + # Large chunks should not be too large: + for chunk_size in large_sizes: + self.assertGreater(2 * self.FRAME_SIZE_TARGET, chunk_size) + + last_chunk_size = len(w.chunks[-1]) + self.assertGreater(2 * self.FRAME_SIZE_TARGET, last_chunk_size) + + # Small chunks (if any) should be very small + # (only proto and frame headers) + for chunk_size in small_sizes: + self.assertGreaterEqual(9, chunk_size) def test_nested_names(self): global Nested From 750ae86fb625af3884b60f34e5dd6d29e841acbd Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 17 Nov 2017 15:24:37 +0100 Subject: [PATCH 22/39] Cleanup _Pickler_CommitFrame --- Modules/_pickle.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/Modules/_pickle.c b/Modules/_pickle.c index efb365119e4e83..0b3cd8eb49379c 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -890,7 +890,7 @@ _Pickler_WriteFrameHeader(PicklerObject *self, char *qdata, size_t frame_len) } static int -_Pickler_CommitFrame(PicklerObject *self, size_t frame_extension) +_Pickler_CommitFrame(PicklerObject *self) { size_t frame_len; char *qdata; @@ -898,7 +898,6 @@ _Pickler_CommitFrame(PicklerObject *self, size_t frame_extension) if (!self->framing || self->frame_start == -1) return 0; frame_len = self->output_len - self->frame_start - FRAME_HEADER_SIZE; - frame_len += frame_extension; qdata = PyBytes_AS_STRING(self->output_buffer) + self->frame_start; _Pickler_WriteFrameHeader(self, qdata, frame_len); self->frame_start = -1; @@ -912,7 +911,7 @@ _Pickler_GetString(PicklerObject *self) assert(self->output_buffer != NULL); - if (_Pickler_CommitFrame(self, 0)) + if (_Pickler_CommitFrame(self)) return NULL; self->output_buffer = NULL; @@ -948,7 +947,7 @@ 
_Pickler_OpcodeBoundary(PicklerObject *self) return 0; frame_len = self->output_len - self->frame_start - FRAME_HEADER_SIZE; if (frame_len >= FRAME_SIZE_TARGET) { - if(_Pickler_CommitFrame(self, 0)) { + if(_Pickler_CommitFrame(self)) { return -1; } if (self->write != NULL) { @@ -959,11 +958,8 @@ _Pickler_OpcodeBoundary(PicklerObject *self) return -1; } } - return 0; - } - else { - return 0; } + return 0; } static Py_ssize_t @@ -2083,7 +2079,7 @@ _Pickler_write_large_bytes( PyObject *result; /* Commit the previous frame. */ - if (_Pickler_CommitFrame(self, 0)) { + if (_Pickler_CommitFrame(self)) { return -1; } /* Disable frameing temporarily */ From c0c89732e0ad694950546a02f3cbfc03de7dfc26 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 17 Nov 2017 16:09:51 +0100 Subject: [PATCH 23/39] Get rid of the frameless_blobs=False exception: actually it works out of the box --- Lib/test/pickletester.py | 16 +++++----------- Lib/test/test_pickletools.py | 4 ---- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 04eec59635af80..e40430b0c33859 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -2039,7 +2039,7 @@ def test_setitems_on_non_dicts(self): FRAME_SIZE_TARGET = 64 * 1024 - def check_frame_opcodes(self, pickled, frameless_blobs=True): + def check_frame_opcodes(self, pickled): """ Check the arguments of FRAME opcodes in a protocol 4+ pickle. 
@@ -2055,7 +2055,7 @@ def check_frame_opcodes(self, pickled, frameless_blobs=True): 'BINUNICODE8': 9, } for op, arg, pos in pickletools.genops(pickled): - if frameless_blobs and op.name in frameless_opcode_sizes: + if op.name in frameless_opcode_sizes: if len(arg) > self.FRAME_SIZE_TARGET: frame_opcode_size = frameless_opcode_sizes[op.name] arg = len(arg) @@ -2099,16 +2099,10 @@ def test_framing_large_objects(self): for proto in range(4, pickle.HIGHEST_PROTOCOL + 1): for fast in [True, False]: if fast and not hasattr(self, 'pickler'): + # The fast flag cannot be changed for test classes that + # only expose the `self.dumps` method. continue - # The default picklers do not include large binary objects in - # frames. - # However some alternative implementation of dump / dumps can - # output pickles that include large binary objects inside - # protocol 4 frames. The test classes of such implementations - # make it explicit by setting the following flag. - frameless_blobs = getattr(self, 'frameless_blobs', True) - with self.subTest(proto=proto, fast=fast): if hasattr(self, 'pickler'): buf = io.BytesIO() @@ -2127,7 +2121,7 @@ def test_framing_large_objects(self): else: # One frame at the beginning and one at the end. self.assertGreaterEqual(n_frames, 2) - self.check_frame_opcodes(pickled, frameless_blobs) + self.check_frame_opcodes(pickled) def test_optional_frames(self): if pickle.HIGHEST_PROTOCOL < 4: diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py index 890aaf31a44311..86bebfa0266862 100644 --- a/Lib/test/test_pickletools.py +++ b/Lib/test/test_pickletools.py @@ -7,10 +7,6 @@ class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests): - # pickletools.optimize only works in-memory on pickle strings: it - # therefore is ok to include large objects inside a large frame. 
- frameless_blobs = False - def dumps(self, arg, proto=None): return pickletools.optimize(pickle.dumps(arg, proto)) From 304571acc66fb2be8c5a464370d1431bbe57aab4 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 17 Nov 2017 17:06:33 +0100 Subject: [PATCH 24/39] Implement frameless blobs for pickletools.optimize --- Lib/pickletools.py | 17 +++++++++++------ Lib/test/pickletester.py | 4 ++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/Lib/pickletools.py b/Lib/pickletools.py index 0c8dddc10bbc7e..d43cf7bba4f504 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -2251,7 +2251,7 @@ def genops(pickle): ############################################################################## # A pickle optimizer. -def optimize(p): +def optimize(pickled): 'Optimize a pickle string by removing unused PUT opcodes' put = 'PUT' get = 'GET' @@ -2260,7 +2260,7 @@ def optimize(p): opcodes = [] # (op, idx) or (pos, end_pos) proto = 0 protoheader = b'' - for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True): + for opcode, arg, pos, end_pos in _genops(pickled, yield_end_pos=True): if 'PUT' in opcode.name: oldids.add(arg) opcodes.append((put, arg)) @@ -2279,7 +2279,7 @@ def optimize(p): if arg > proto: proto = arg if pos == 0: - protoheader = p[pos: end_pos] + protoheader = pickled[pos:end_pos] else: opcodes.append((pos, end_pos)) else: @@ -2295,6 +2295,7 @@ def optimize(p): pickler.framer.start_framing() idx = 0 for op, arg in opcodes: + frameless = False if op is put: if arg not in newids: continue @@ -2304,9 +2305,13 @@ def optimize(p): elif op is get: data = pickler.get(newids[arg]) else: - data = p[op:arg] - pickler.framer.commit_frame() - pickler.write(data) + data = pickled[op:arg] + frameless = len(data) > pickler.framer._FRAME_SIZE_TARGET + pickler.framer.commit_frame(force=frameless) + if frameless: + pickler.framer.file_write(data) + else: + pickler.write(data) pickler.framer.end_framing() return out.getvalue() diff --git 
a/Lib/test/pickletester.py b/Lib/test/pickletester.py index e40430b0c33859..2b77abff00fde9 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -2113,6 +2113,10 @@ def test_framing_large_objects(self): else: pickled = self.dumps(obj, proto) unpickled = self.loads(pickled) + # More informative error message in case of failure. + self.assertEqual([len(x) for x in obj], + [len(x) for x in unpickled]) + # Perform full equality check if the lengths match. self.assertEqual(obj, unpickled) n_frames = count_opcode(pickle.FRAME, pickled) if not fast: From 4ee2ee914355d0761e15f37380e7a7f0f06a6ffe Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 17 Nov 2017 17:10:45 +0100 Subject: [PATCH 25/39] Do not skip test silently if self.pickler is renamed --- Lib/test/pickletester.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 2b77abff00fde9..1eb831b43a048b 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -2098,11 +2098,6 @@ def test_framing_large_objects(self): obj = [b'x' * N, b'y' * N, 'z' * N] for proto in range(4, pickle.HIGHEST_PROTOCOL + 1): for fast in [True, False]: - if fast and not hasattr(self, 'pickler'): - # The fast flag cannot be changed for test classes that - # only expose the `self.dumps` method. - continue - with self.subTest(proto=proto, fast=fast): if hasattr(self, 'pickler'): buf = io.BytesIO() @@ -2111,6 +2106,8 @@ def test_framing_large_objects(self): pickler.dump(obj) pickled = buf.getvalue() else: + # Note that we cannot set the fast flag to fast flag + # to True in this case. pickled = self.dumps(obj, proto) unpickled = self.loads(pickled) # More informative error message in case of failure. 
From b4c978b6985d663353a5e5a84a056bdaad4c01bc Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 17 Nov 2017 17:14:01 +0100 Subject: [PATCH 26/39] Remove on attribute lookup --- Lib/pickle.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 30aab33da0e10c..5585d51ddeb8de 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -202,8 +202,9 @@ def commit_frame(self, force=False): f = self.current_frame if f.tell() >= self._FRAME_SIZE_TARGET or force: with f.getbuffer() as data: - self.file_write(FRAME + pack(" Date: Fri, 17 Nov 2017 17:46:26 +0100 Subject: [PATCH 27/39] Typo in comment --- Lib/test/pickletester.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 1eb831b43a048b..6460716c843ac7 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -2106,8 +2106,8 @@ def test_framing_large_objects(self): pickler.dump(obj) pickled = buf.getvalue() else: - # Note that we cannot set the fast flag to fast flag - # to True in this case. + # Note that we cannot set the fast flag to True in + # this case. pickled = self.dumps(obj, proto) unpickled = self.loads(pickled) # More informative error message in case of failure. From 8181630c8dfddd2e5c0c3888ef780743d73145f1 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 17 Nov 2017 19:08:46 +0100 Subject: [PATCH 28/39] Add comments for tests that require self.pickler --- Lib/test/pickletester.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 6460716c843ac7..5ced1c72225748 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -2105,9 +2105,11 @@ def test_framing_large_objects(self): pickler.fast = fast pickler.dump(obj) pickled = buf.getvalue() + elif fast: + continue else: - # Note that we cannot set the fast flag to True in - # this case. 
+ # Fallback to self.dumps when fast=False and + # self.pickler is not available. pickled = self.dumps(obj, proto) unpickled = self.loads(pickled) # More informative error message in case of failure. @@ -2165,8 +2167,9 @@ def remove_frames(pickled, keep_frame=None): def test_framed_write_sizes(self): if not hasattr(self, 'pickler'): + # This test requires passing a custom file-object to measure + # the number of calls to file.write and the size of each chunk. return - class Writer: def __init__(self): From 25e4e047d9c56e9cbcdc0f56ef044f4cc2e3e15a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 24 Nov 2017 17:36:01 +0100 Subject: [PATCH 29/39] revert variable renaming --- Lib/pickletools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/pickletools.py b/Lib/pickletools.py index d43cf7bba4f504..534d3a02d73ac9 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -2251,7 +2251,7 @@ def genops(pickle): ############################################################################## # A pickle optimizer. 
-def optimize(pickled): +def optimize(p): 'Optimize a pickle string by removing unused PUT opcodes' put = 'PUT' get = 'GET' @@ -2260,7 +2260,7 @@ def optimize(pickled): opcodes = [] # (op, idx) or (pos, end_pos) proto = 0 protoheader = b'' - for opcode, arg, pos, end_pos in _genops(pickled, yield_end_pos=True): + for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True): if 'PUT' in opcode.name: oldids.add(arg) opcodes.append((put, arg)) @@ -2279,7 +2279,7 @@ def optimize(pickled): if arg > proto: proto = arg if pos == 0: - protoheader = pickled[pos:end_pos] + protoheader = p[pos:end_pos] else: opcodes.append((pos, end_pos)) else: @@ -2305,7 +2305,7 @@ def optimize(pickled): elif op is get: data = pickler.get(newids[arg]) else: - data = pickled[op:arg] + data = p[op:arg] frameless = len(data) > pickler.framer._FRAME_SIZE_TARGET pickler.framer.commit_frame(force=frameless) if frameless: From b57ef9de78f4d3d56aa8c6fb229a80411d5eca7e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 24 Nov 2017 17:37:14 +0100 Subject: [PATCH 30/39] Disable test_framed_write_sizes in OptimizedPickleTests --- Lib/test/pickletester.py | 4 ---- Lib/test/test_pickletools.py | 3 +++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 5ced1c72225748..7092bd02efc825 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -2166,10 +2166,6 @@ def remove_frames(pickled, keep_frame=None): self.assertEqual(obj, self.loads(some_frames_pickle)) def test_framed_write_sizes(self): - if not hasattr(self, 'pickler'): - # This test requires passing a custom file-object to measure - # the number of calls to file.write and the size of each chunk. 
- return class Writer: def __init__(self): diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py index 86bebfa0266862..4b5e4ecb39a2c8 100644 --- a/Lib/test/test_pickletools.py +++ b/Lib/test/test_pickletools.py @@ -16,6 +16,9 @@ def loads(self, buf, **kwds): # Test relies on precise output of dumps() test_pickle_to_2x = None + # Test relies on writing by chunks into a file object. + test_framed_write_sizes = None + def test_optimize_long_binget(self): data = [str(i) for i in range(257)] data.append(data[-1]) From 727b5c2ffd4b0979ff9e614d34dbd99d452deb40 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 24 Nov 2017 17:46:09 +0100 Subject: [PATCH 31/39] Small code style fix --- Modules/_pickle.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 0b3cd8eb49379c..104c99aca7302b 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -943,8 +943,9 @@ _Pickler_OpcodeBoundary(PicklerObject *self) { Py_ssize_t frame_len; - if (!self->framing || self->frame_start == -1) + if (!self->framing || self->frame_start == -1) { return 0; + } frame_len = self->output_len - self->frame_start - FRAME_HEADER_SIZE; if (frame_len >= FRAME_SIZE_TARGET) { if(_Pickler_CommitFrame(self)) { From 8af932164f54562743c023e44b57c0cd0b8f86f0 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 27 Nov 2017 15:03:28 +0100 Subject: [PATCH 32/39] Add comments to explain tradeoffs in Python pickle frame commit --- Lib/pickle.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Lib/pickle.py b/Lib/pickle.py index 5585d51ddeb8de..1d4f4d9709861c 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -203,7 +203,15 @@ def commit_frame(self, force=False): if f.tell() >= self._FRAME_SIZE_TARGET or force: with f.getbuffer() as data: write = self.file_write + # Issue a single call to the write nethod of the underlying + # file object for the frame opcode with the size of the + # frame. 
The concatenation is expected to be less expensive + # than issuing an additional call to write. write(FRAME + pack(" Date: Sun, 3 Dec 2017 22:51:00 +0100 Subject: [PATCH 33/39] Implement no-copy support for delayed writers in Python pickler --- Lib/pickle.py | 32 ++++++++++++++++++-------------- Lib/test/pickletester.py | 38 ++++++++++++++++++++++++++------------ 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 1d4f4d9709861c..2b5f98ceb22f20 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -201,20 +201,24 @@ def commit_frame(self, force=False): if self.current_frame: f = self.current_frame if f.tell() >= self._FRAME_SIZE_TARGET or force: - with f.getbuffer() as data: - write = self.file_write - # Issue a single call to the write nethod of the underlying - # file object for the frame opcode with the size of the - # frame. The concatenation is expected to be less expensive - # than issuing an additional call to write. - write(FRAME + pack(" 0: n_frames += 1 # There should be at least one call to write per frame - self.assertGreaterEqual(len(w.chunks), n_frames) + self.assertGreaterEqual(len(writer.chunks), n_frames) # but not too many either: there can be one for the proto, # one per-frame header and one per frame for the actual contents. 
- self.assertGreaterEqual(2 * n_frames + 1, len(w.chunks)) + self.assertGreaterEqual(2 * n_frames + 1, len(writer.chunks)) - chunk_sizes = [len(c) for c in w.chunks[:-1]] + chunk_sizes = [len(c) for c in writer.chunks[:-1]] large_sizes = [s for s in chunk_sizes if s >= self.FRAME_SIZE_TARGET] small_sizes = [s for s in chunk_sizes @@ -2208,7 +2222,7 @@ def write(self, chunk): for chunk_size in large_sizes: self.assertGreater(2 * self.FRAME_SIZE_TARGET, chunk_size) - last_chunk_size = len(w.chunks[-1]) + last_chunk_size = len(writer.chunks[-1]) self.assertGreater(2 * self.FRAME_SIZE_TARGET, last_chunk_size) # Small chunks (if any) should be very small From 796c46988110c2fb4315abe239871e0f0faed41d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sun, 3 Dec 2017 23:21:43 +0100 Subject: [PATCH 34/39] Fix renamed test_framed_write_sizes_with_delayed_writer in test_pickletools.py --- Lib/test/pickletester.py | 2 +- Lib/test/test_pickletools.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index a29ceaaa510b34..412cfc7d822e54 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -2179,7 +2179,7 @@ def concatenate_chunks(self): # Some chunks can be memoryview instances, we need to convert # them to bytes to be able to call join return b"".join([c.tobytes() if hasattr(c, 'tobytes') else c - for c in w.chunks]) + for c in self.chunks]) small_objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)}) for i in range(int(1e4))] diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py index 4b5e4ecb39a2c8..1c1b7adb221ce3 100644 --- a/Lib/test/test_pickletools.py +++ b/Lib/test/test_pickletools.py @@ -17,7 +17,7 @@ def loads(self, buf, **kwds): test_pickle_to_2x = None # Test relies on writing by chunks into a file object. 
- test_framed_write_sizes = None + test_framed_write_sizes_with_delayed_writer = None  def test_optimize_long_binget(self): data = [str(i) for i in range(257)] data.append(data[-1]) From a2e87e9d88daacf50a294617be471bea49f992b4 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 4 Jan 2018 09:32:48 +0100 Subject: [PATCH 35/39] Add comment to explain flush after commit in _Pickler_OpcodeBoundary --- Modules/_pickle.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 104c99aca7302b..27f4ce521ac14d 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -951,6 +951,13 @@ _Pickler_OpcodeBoundary(PicklerObject *self) if(_Pickler_CommitFrame(self)) { return -1; } + /* Flush the content of the committed frame to the underlying + * file and reuse the pickler buffer for the next frame so as + * to limit memory usage when dumping large complex objects to + * a file. + * + * self-write is NULL when called via dumps. + */ if (self->write != NULL) { if (_Pickler_FlushToFile(self) < 0) { return -1; From 8edaa6414650041cf8bb778d3c93783661144a6f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 4 Jan 2018 09:35:57 +0100 Subject: [PATCH 36/39] Removed unused argument payload_size --- Modules/_pickle.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 27f4ce521ac14d..041383b451a7a0 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -2080,7 +2080,7 @@ save_float(PicklerObject *self, PyObject *obj) static int _Pickler_write_large_bytes( PicklerObject *self, const char *header, Py_ssize_t header_size, - PyObject *payload, Py_ssize_t payload_size) + PyObject *payload) { assert(self->output_buffer != NULL); assert(self->write != NULL); @@ -2209,7 +2209,7 @@ save_bytes(PicklerObject *self, PyObject *obj) else { /* Bypass the in-memory buffer to directly stream large data into the underlying file object. 
*/ - if (_Pickler_write_large_bytes(self, header, len, obj, size) < 0) { + if (_Pickler_write_large_bytes(self, header, len, obj) < 0) { return -1; } } @@ -2339,7 +2339,7 @@ write_utf8(PicklerObject *self, const char *data, Py_ssize_t size) if (mem == NULL) { return -1; } - if (_Pickler_write_large_bytes(self, header, len, mem, size) < 0) { + if (_Pickler_write_large_bytes(self, header, len, mem) < 0) { Py_DECREF(mem); return -1; } From 609ebfe7a279c59cf9e8dd20c0eebe0afa304414 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 4 Jan 2018 09:56:41 +0100 Subject: [PATCH 37/39] Update the changelog to reflect the new behavior of the Python pickler --- .../next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst index 595826ae5f54a5..d6e8cfcacbce20 100644 --- a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst +++ b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst @@ -5,3 +5,10 @@ directly streamed into the underlying file object. Previously the C implementation would buffer all content and issue a single call to ``file.write`` at the end of the dump. With protocol 4 this behavior has changed to issue one call to ``file.write`` per frame. + +The Python pickler with protocol 4 now dump each frame content as a +memoryview to an IOBytes instance that will never be reused and the +memoryview is no longer released after the call to write. This makes it +possible for the file object to delay access to the memoryview of +previous frames without forcing any additional memory copy as was +already possible with the C pickler. 
From 28f629784f6b8cd6dd6d942146cf37be73595cf4 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 4 Jan 2018 09:57:59 +0100 Subject: [PATCH 38/39] Fix phrasing --- .../next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst index d6e8cfcacbce20..b453e21c80c917 100644 --- a/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst +++ b/Misc/NEWS.d/next/Library/2017-11-10-00-05-08.bpo-31993.-OMNg8.rst @@ -6,8 +6,8 @@ Previously the C implementation would buffer all content and issue a single call to ``file.write`` at the end of the dump. With protocol 4 this behavior has changed to issue one call to ``file.write`` per frame. -The Python pickler with protocol 4 now dump each frame content as a -memoryview to an IOBytes instance that will never be reused and the +The Python pickler with protocol 4 now dumps each frame content as a +memoryview to a ``BytesIO`` instance that is never reused and the memoryview is no longer released after the call to write. 
This makes it possible for the file object to delay access to the memoryview of previous frames without forcing any additional memory copy as was From 7c91f061f69158308eaa0e408ca883b4e3d03d46 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 5 Jan 2018 13:51:04 +0100 Subject: [PATCH 39/39] Fixed typos in comments --- Lib/pickle.py | 2 +- Modules/_pickle.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 2b5f98ceb22f20..087c9ffdb8f752 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -203,7 +203,7 @@ def commit_frame(self, force=False): if f.tell() >= self._FRAME_SIZE_TARGET or force: data = f.getbuffer() write = self.file_write - # Issue a single call to the write nethod of the underlying + # Issue a single call to the write method of the underlying # file object for the frame opcode with the size of the # frame. The concatenation is expected to be less expensive # than issuing an additional call to write. diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 041383b451a7a0..65d7aba4b967b8 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -956,7 +956,7 @@ _Pickler_OpcodeBoundary(PicklerObject *self) * to limit memory usage when dumping large complex objects to * a file. * - * self-write is NULL when called via dumps. + * self->write is NULL when called via dumps. */ if (self->write != NULL) { if (_Pickler_FlushToFile(self) < 0) {