File size: 3,946 Bytes
3362754
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# SPDX-FileCopyrightText: 2015 Eric Larson
#
# SPDX-License-Identifier: Apache-2.0

from tempfile import NamedTemporaryFile
import mmap


class CallbackFileWrapper(object):
    """
    Small wrapper around a fp object which will tee everything read into a
    buffer, and when that file is closed it will execute a callback with the
    contents of that buffer.

    All attributes are proxied to the underlying file object.

    This class uses members with a double underscore (__) leading prefix so as
    not to accidentally shadow an attribute.

    The data is stored in a temporary file until it is all available.  As long
    as the temporary files directory is disk-based (sometimes it's a
    memory-backed-``tmpfs`` on Linux), data will be unloaded to disk if memory
    pressure is high.  For small files the disk usually won't be used at all,
    it'll all be in the filesystem memory cache, so there should be no
    performance impact.
    """

    def __init__(self, fp, callback):
        self.__buf = NamedTemporaryFile("rb+", delete=True)
        self.__fp = fp
        self.__callback = callback

    def __getattr__(self, name):
        # The vaguaries of garbage collection means that self.__fp is
        # not always set.  By using __getattribute__ and the private
        # name[0] allows looking up the attribute value and raising an
        # AttributeError when it doesn't exist. This stop thigns from
        # infinitely recursing calls to getattr in the case where
        # self.__fp hasn't been set.
        #
        # [0] https://docs.python.org/2/reference/expressions.html#atom-identifiers
        fp = self.__getattribute__("_CallbackFileWrapper__fp")
        return getattr(fp, name)

    def __is_fp_closed(self):
        try:
            return self.__fp.fp is None

        except AttributeError:
            pass

        try:
            return self.__fp.closed

        except AttributeError:
            pass

        # We just don't cache it then.
        # TODO: Add some logging here...
        return False

    def _close(self):
        if self.__callback:
            if self.__buf.tell() == 0:
                # Empty file:
                result = b""
            else:
                # Return the data without actually loading it into memory,
                # relying on Python's buffer API and mmap(). mmap() just gives
                # a view directly into the filesystem's memory cache, so it
                # doesn't result in duplicate memory use.
                self.__buf.seek(0, 0)
                result = memoryview(
                    mmap.mmap(self.__buf.fileno(), 0, access=mmap.ACCESS_READ)
                )
            self.__callback(result)

        # We assign this to None here, because otherwise we can get into
        # really tricky problems where the CPython interpreter dead locks
        # because the callback is holding a reference to something which
        # has a __del__ method. Setting this to None breaks the cycle
        # and allows the garbage collector to do it's thing normally.
        self.__callback = None

        # Closing the temporary file releases memory and frees disk space.
        # Important when caching big files.
        self.__buf.close()

    def read(self, amt=None):
        data = self.__fp.read(amt)
        if data:
            # We may be dealing with b'', a sign that things are over:
            # it's passed e.g. after we've already closed self.__buf.
            self.__buf.write(data)
        if self.__is_fp_closed():
            self._close()

        return data

    def _safe_read(self, amt):
        data = self.__fp._safe_read(amt)
        if amt == 2 and data == b"\r\n":
            # urllib executes this read to toss the CRLF at the end
            # of the chunk.
            return data

        self.__buf.write(data)
        if self.__is_fp_closed():
            self._close()

        return data