Python 3 - Can pickle handle byte objects larger than 4GB?
Here is a simple workaround for issue 24658: use pickle.dumps or pickle.loads to convert the object to or from a bytes object in memory, and move those bytes into or out of the file in chunks of at most 2**31 - 1 bytes.
import pickle
import os.path

file_path = "pkl.pkl"
n_bytes = 2**31
# Largest number of bytes moved by a single read/write call; staying below
# 2**31 avoids the oversized-I/O failure described in issue 24658.
max_bytes = 2**31 - 1


def pickle_dump_chunked(obj, path, chunk_size=max_bytes):
    """Pickle *obj* in memory and write the result to *path* in chunks.

    Args:
        obj: Any picklable object.
        path: Destination file path; the file is created or truncated.
        chunk_size: Maximum number of bytes passed to a single ``write``.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    payload = pickle.dumps(obj)
    # A memoryview lets each chunk be written without first copying it,
    # which matters when a single chunk can be close to 2 GiB.
    with open(path, "wb") as f_out, memoryview(payload) as view:
        for start in range(0, len(view), chunk_size):
            f_out.write(view[start:start + chunk_size])


def pickle_load_chunked(path, chunk_size=max_bytes):
    """Read *path* in chunks and return the unpickled object.

    Only use this on files you trust: unpickling can execute arbitrary code.

    Args:
        path: Path of a file holding a single pickle.
        chunk_size: Maximum number of bytes requested by a single read.

    Returns:
        The object reconstructed by ``pickle.loads``.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    size = os.path.getsize(path)
    # Allocate the whole buffer once instead of growing it chunk by chunk.
    buf = bytearray(size)
    pos = 0
    with open(path, "rb") as f_in, memoryview(buf) as view:
        while pos < size:
            with view[pos:pos + chunk_size] as window:
                got = f_in.readinto(window)
            if not got:
                # The file ended earlier than its reported size.
                break
            pos += got
    if pos < size:
        del buf[pos:]
    return pickle.loads(buf)


def main():
    """Round-trip a 2 GiB bytearray through ``file_path`` and verify it."""
    data = bytearray(n_bytes)

    ## write
    pickle_dump_chunked(data, file_path)

    ## read
    data2 = pickle_load_chunked(file_path)

    if data != data2:
        raise AssertionError("round-tripped data differs from the original")


if __name__ == "__main__":
    main()
To sum up what was answered in the comments:
Yes, Python can pickle byte objects bigger than 4GB. The observed error is caused by a bug in the implementation (see Issue24658).
Here is the full workaround, though it seems pickle.load no longer tries to read a huge file in a single call (I am on Python 3.5.2), so strictly speaking only pickle.dump needs this wrapper to work properly.
import pickle


class MacOSFile(object):
    """Binary file wrapper that splits oversized reads and writes into chunks.

    Every attribute other than ``read`` and ``write`` is delegated to the
    wrapped file object.

    Args:
        f: The underlying binary file object.
        chunk_size: Largest number of bytes handed to a single underlying
            ``read``/``write`` call. Defaults to ``2**31 - 1``.
        verbose: When true (the default), ``write`` prints progress messages.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """

    def __init__(self, f, *, chunk_size=(1 << 31) - 1, verbose=True):
        # Note the parentheses: ``1 << 31 - 1`` would parse as ``1 << 30``.
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        self.f = f
        self.chunk_size = chunk_size
        self.verbose = verbose

    def __getattr__(self, item):
        # Only reached when normal lookup fails. If ``f`` itself is missing
        # (e.g. ``__init__`` has not run), fail instead of recursing forever.
        if item == "f":
            raise AttributeError(item)
        return getattr(self.f, item)

    def read(self, n=-1):
        """Read up to *n* bytes, chunking requests larger than ``chunk_size``.

        Requests that fit in one chunk (or ask for "everything" via ``None``
        or a negative size) are passed straight to the wrapped file. Larger
        requests return a ``bytearray`` holding the bytes actually read,
        which is shorter than *n* if the file ends first.
        """
        if n is None or n < 0 or n <= self.chunk_size:
            return self.f.read(n)
        buffer = bytearray(n)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, self.chunk_size)
            chunk = self.f.read(batch_size)
            if not chunk:
                # End of file (or nothing available): stop early.
                break
            # Advance by what was really returned, not by what was asked for.
            buffer[idx:idx + len(chunk)] = chunk
            idx += len(chunk)
        if idx < n:
            del buffer[idx:]
        return buffer

    def write(self, buffer):
        """Write a bytes-like *buffer* in chunks of at most ``chunk_size``.

        Returns:
            The total number of bytes in *buffer*.
        """
        # Slicing a memoryview does not copy, unlike slicing bytes.
        view = memoryview(buffer)
        if view.ndim != 1 or view.itemsize != 1:
            view = view.cast("B")
        n = view.nbytes
        if self.verbose:
            print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, self.chunk_size)
            if self.verbose:
                print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(view[idx:idx + batch_size])
            if self.verbose:
                print("done.", flush=True)
            idx += batch_size
        return n


def pickle_dump(obj, file_path):
    """Pickle *obj* to *file_path* with the highest protocol, writing in chunks."""
    with open(file_path, "wb") as f:
        return pickle.dump(obj, MacOSFile(f), protocol=pickle.HIGHEST_PROTOCOL)


def pickle_load(file_path):
    """Return the object unpickled from *file_path*, reading in chunks.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(file_path, "rb") as f:
        return pickle.load(MacOSFile(f))