# memmap.py (11 KB)
  1. import numpy as np
  2. from .numeric import uint8, ndarray, dtype
  3. from numpy.compat import (
  4. os_fspath, contextlib_nullcontext, is_pathlib_path
  5. )
  6. from numpy.core.overrides import set_module
  7. __all__ = ['memmap']
  8. dtypedescr = dtype
  9. valid_filemodes = ["r", "c", "r+", "w+"]
  10. writeable_filemodes = ["r+", "w+"]
  11. mode_equivalents = {
  12. "readonly":"r",
  13. "copyonwrite":"c",
  14. "readwrite":"r+",
  15. "write":"w+"
  16. }
  17. @set_module('numpy')
  18. class memmap(ndarray):
  19. """Create a memory-map to an array stored in a *binary* file on disk.
  20. Memory-mapped files are used for accessing small segments of large files
  21. on disk, without reading the entire file into memory. NumPy's
  22. memmap's are array-like objects. This differs from Python's ``mmap``
  23. module, which uses file-like objects.
  24. This subclass of ndarray has some unpleasant interactions with
  25. some operations, because it doesn't quite fit properly as a subclass.
  26. An alternative to using this subclass is to create the ``mmap``
  27. object yourself, then create an ndarray with ndarray.__new__ directly,
  28. passing the object created in its 'buffer=' parameter.
  29. This class may at some point be turned into a factory function
  30. which returns a view into an mmap buffer.
  31. Delete the memmap instance to close the memmap file.
  32. Parameters
  33. ----------
  34. filename : str, file-like object, or pathlib.Path instance
  35. The file name or file object to be used as the array data buffer.
  36. dtype : data-type, optional
  37. The data-type used to interpret the file contents.
  38. Default is `uint8`.
  39. mode : {'r+', 'r', 'w+', 'c'}, optional
  40. The file is opened in this mode:
  41. +------+-------------------------------------------------------------+
  42. | 'r' | Open existing file for reading only. |
  43. +------+-------------------------------------------------------------+
  44. | 'r+' | Open existing file for reading and writing. |
  45. +------+-------------------------------------------------------------+
  46. | 'w+' | Create or overwrite existing file for reading and writing. |
  47. +------+-------------------------------------------------------------+
  48. | 'c' | Copy-on-write: assignments affect data in memory, but |
  49. | | changes are not saved to disk. The file on disk is |
  50. | | read-only. |
  51. +------+-------------------------------------------------------------+
  52. Default is 'r+'.
  53. offset : int, optional
  54. In the file, array data starts at this offset. Since `offset` is
  55. measured in bytes, it should normally be a multiple of the byte-size
  56. of `dtype`. When ``mode != 'r'``, even positive offsets beyond end of
  57. file are valid; The file will be extended to accommodate the
  58. additional data. By default, ``memmap`` will start at the beginning of
  59. the file, even if ``filename`` is a file pointer ``fp`` and
  60. ``fp.tell() != 0``.
  61. shape : tuple, optional
  62. The desired shape of the array. If ``mode == 'r'`` and the number
  63. of remaining bytes after `offset` is not a multiple of the byte-size
  64. of `dtype`, you must specify `shape`. By default, the returned array
  65. will be 1-D with the number of elements determined by file size
  66. and data-type.
  67. order : {'C', 'F'}, optional
  68. Specify the order of the ndarray memory layout:
  69. :term:`row-major`, C-style or :term:`column-major`,
  70. Fortran-style. This only has an effect if the shape is
  71. greater than 1-D. The default order is 'C'.
  72. Attributes
  73. ----------
  74. filename : str or pathlib.Path instance
  75. Path to the mapped file.
  76. offset : int
  77. Offset position in the file.
  78. mode : str
  79. File mode.
  80. Methods
  81. -------
  82. flush
  83. Flush any changes in memory to file on disk.
  84. When you delete a memmap object, flush is called first to write
  85. changes to disk before removing the object.
  86. See also
  87. --------
  88. lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.
  89. Notes
  90. -----
  91. The memmap object can be used anywhere an ndarray is accepted.
  92. Given a memmap ``fp``, ``isinstance(fp, numpy.ndarray)`` returns
  93. ``True``.
  94. Memory-mapped files cannot be larger than 2GB on 32-bit systems.
  95. When a memmap causes a file to be created or extended beyond its
  96. current size in the filesystem, the contents of the new part are
  97. unspecified. On systems with POSIX filesystem semantics, the extended
  98. part will be filled with zero bytes.
  99. Examples
  100. --------
  101. >>> data = np.arange(12, dtype='float32')
  102. >>> data.resize((3,4))
  103. This example uses a temporary file so that doctest doesn't write
  104. files to your directory. You would use a 'normal' filename.
  105. >>> from tempfile import mkdtemp
  106. >>> import os.path as path
  107. >>> filename = path.join(mkdtemp(), 'newfile.dat')
  108. Create a memmap with dtype and shape that matches our data:
  109. >>> fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))
  110. >>> fp
  111. memmap([[0., 0., 0., 0.],
  112. [0., 0., 0., 0.],
  113. [0., 0., 0., 0.]], dtype=float32)
  114. Write data to memmap array:
  115. >>> fp[:] = data[:]
  116. >>> fp
  117. memmap([[ 0., 1., 2., 3.],
  118. [ 4., 5., 6., 7.],
  119. [ 8., 9., 10., 11.]], dtype=float32)
  120. >>> fp.filename == path.abspath(filename)
  121. True
  122. Deletion flushes memory changes to disk before removing the object:
  123. >>> del fp
  124. Load the memmap and verify data was stored:
  125. >>> newfp = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
  126. >>> newfp
  127. memmap([[ 0., 1., 2., 3.],
  128. [ 4., 5., 6., 7.],
  129. [ 8., 9., 10., 11.]], dtype=float32)
  130. Read-only memmap:
  131. >>> fpr = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
  132. >>> fpr.flags.writeable
  133. False
  134. Copy-on-write memmap:
  135. >>> fpc = np.memmap(filename, dtype='float32', mode='c', shape=(3,4))
  136. >>> fpc.flags.writeable
  137. True
  138. It's possible to assign to copy-on-write array, but values are only
  139. written into the memory copy of the array, and not written to disk:
  140. >>> fpc
  141. memmap([[ 0., 1., 2., 3.],
  142. [ 4., 5., 6., 7.],
  143. [ 8., 9., 10., 11.]], dtype=float32)
  144. >>> fpc[0,:] = 0
  145. >>> fpc
  146. memmap([[ 0., 0., 0., 0.],
  147. [ 4., 5., 6., 7.],
  148. [ 8., 9., 10., 11.]], dtype=float32)
  149. File on disk is unchanged:
  150. >>> fpr
  151. memmap([[ 0., 1., 2., 3.],
  152. [ 4., 5., 6., 7.],
  153. [ 8., 9., 10., 11.]], dtype=float32)
  154. Offset into a memmap:
  155. >>> fpo = np.memmap(filename, dtype='float32', mode='r', offset=16)
  156. >>> fpo
  157. memmap([ 4., 5., 6., 7., 8., 9., 10., 11.], dtype=float32)
  158. """
  159. __array_priority__ = -100.0
  160. def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
  161. shape=None, order='C'):
  162. # Import here to minimize 'import numpy' overhead
  163. import mmap
  164. import os.path
  165. try:
  166. mode = mode_equivalents[mode]
  167. except KeyError as e:
  168. if mode not in valid_filemodes:
  169. raise ValueError(
  170. "mode must be one of {!r} (got {!r})"
  171. .format(valid_filemodes + list(mode_equivalents.keys()), mode)
  172. ) from None
  173. if mode == 'w+' and shape is None:
  174. raise ValueError("shape must be given")
  175. if hasattr(filename, 'read'):
  176. f_ctx = contextlib_nullcontext(filename)
  177. else:
  178. f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b')
  179. with f_ctx as fid:
  180. fid.seek(0, 2)
  181. flen = fid.tell()
  182. descr = dtypedescr(dtype)
  183. _dbytes = descr.itemsize
  184. if shape is None:
  185. bytes = flen - offset
  186. if bytes % _dbytes:
  187. raise ValueError("Size of available data is not a "
  188. "multiple of the data-type size.")
  189. size = bytes // _dbytes
  190. shape = (size,)
  191. else:
  192. if not isinstance(shape, tuple):
  193. shape = (shape,)
  194. size = np.intp(1) # avoid default choice of np.int_, which might overflow
  195. for k in shape:
  196. size *= k
  197. bytes = int(offset + size*_dbytes)
  198. if mode in ('w+', 'r+') and flen < bytes:
  199. fid.seek(bytes - 1, 0)
  200. fid.write(b'\0')
  201. fid.flush()
  202. if mode == 'c':
  203. acc = mmap.ACCESS_COPY
  204. elif mode == 'r':
  205. acc = mmap.ACCESS_READ
  206. else:
  207. acc = mmap.ACCESS_WRITE
  208. start = offset - offset % mmap.ALLOCATIONGRANULARITY
  209. bytes -= start
  210. array_offset = offset - start
  211. mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start)
  212. self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
  213. offset=array_offset, order=order)
  214. self._mmap = mm
  215. self.offset = offset
  216. self.mode = mode
  217. if is_pathlib_path(filename):
  218. # special case - if we were constructed with a pathlib.path,
  219. # then filename is a path object, not a string
  220. self.filename = filename.resolve()
  221. elif hasattr(fid, "name") and isinstance(fid.name, str):
  222. # py3 returns int for TemporaryFile().name
  223. self.filename = os.path.abspath(fid.name)
  224. # same as memmap copies (e.g. memmap + 1)
  225. else:
  226. self.filename = None
  227. return self
  228. def __array_finalize__(self, obj):
  229. if hasattr(obj, '_mmap') and np.may_share_memory(self, obj):
  230. self._mmap = obj._mmap
  231. self.filename = obj.filename
  232. self.offset = obj.offset
  233. self.mode = obj.mode
  234. else:
  235. self._mmap = None
  236. self.filename = None
  237. self.offset = None
  238. self.mode = None
  239. def flush(self):
  240. """
  241. Write any changes in the array to the file on disk.
  242. For further information, see `memmap`.
  243. Parameters
  244. ----------
  245. None
  246. See Also
  247. --------
  248. memmap
  249. """
  250. if self.base is not None and hasattr(self.base, 'flush'):
  251. self.base.flush()
  252. def __array_wrap__(self, arr, context=None):
  253. arr = super(memmap, self).__array_wrap__(arr, context)
  254. # Return a memmap if a memmap was given as the output of the
  255. # ufunc. Leave the arr class unchanged if self is not a memmap
  256. # to keep original memmap subclasses behavior
  257. if self is arr or type(self) is not memmap:
  258. return arr
  259. # Return scalar instead of 0d memmap, e.g. for np.sum with
  260. # axis=None
  261. if arr.shape == ():
  262. return arr[()]
  263. # Return ndarray otherwise
  264. return arr.view(np.ndarray)
  265. def __getitem__(self, index):
  266. res = super(memmap, self).__getitem__(index)
  267. if type(res) is memmap and res._mmap is None:
  268. return res.view(type=ndarray)
  269. return res