__init__.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. # coding: utf-8
  2. """
  3. webencodings
  4. ~~~~~~~~~~~~
  5. This is a Python implementation of the `WHATWG Encoding standard
  6. <http://encoding.spec.whatwg.org/>`_. See README for details.
  7. :copyright: Copyright 2012 by Simon Sapin
  8. :license: BSD, see LICENSE for details.
  9. """
  10. from __future__ import unicode_literals
  11. import codecs
  12. from .labels import LABELS
  13. VERSION = '0.5.1'
  14. # Some names in Encoding are not valid Python aliases. Remap these.
  15. PYTHON_NAMES = {
  16. 'iso-8859-8-i': 'iso-8859-8',
  17. 'x-mac-cyrillic': 'mac-cyrillic',
  18. 'macintosh': 'mac-roman',
  19. 'windows-874': 'cp874'}
  20. CACHE = {}
  21. def ascii_lower(string):
  22. r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
  23. :param string: An Unicode string.
  24. :returns: A new Unicode string.
  25. This is used for `ASCII case-insensitive
  26. <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
  27. matching of encoding labels.
  28. The same matching is also used, among other things,
  29. for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.
  30. This is different from the :meth:`~py:str.lower` method of Unicode strings
  31. which also affect non-ASCII characters,
  32. sometimes mapping them into the ASCII range:
  33. >>> keyword = u'Bac\N{KELVIN SIGN}ground'
  34. >>> assert keyword.lower() == u'background'
  35. >>> assert ascii_lower(keyword) != keyword.lower()
  36. >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
  37. """
  38. # This turns out to be faster than unicode.translate()
  39. return string.encode('utf8').lower().decode('utf8')
  40. def lookup(label):
  41. """
  42. Look for an encoding by its label.
  43. This is the spec’s `get an encoding
  44. <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
  45. Supported labels are listed there.
  46. :param label: A string.
  47. :returns:
  48. An :class:`Encoding` object, or :obj:`None` for an unknown label.
  49. """
  50. # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
  51. label = ascii_lower(label.strip('\t\n\f\r '))
  52. name = LABELS.get(label)
  53. if name is None:
  54. return None
  55. encoding = CACHE.get(name)
  56. if encoding is None:
  57. if name == 'x-user-defined':
  58. from .x_user_defined import codec_info
  59. else:
  60. python_name = PYTHON_NAMES.get(name, name)
  61. # Any python_name value that gets to here should be valid.
  62. codec_info = codecs.lookup(python_name)
  63. encoding = Encoding(name, codec_info)
  64. CACHE[name] = encoding
  65. return encoding
  66. def _get_encoding(encoding_or_label):
  67. """
  68. Accept either an encoding object or label.
  69. :param encoding: An :class:`Encoding` object or a label string.
  70. :returns: An :class:`Encoding` object.
  71. :raises: :exc:`~exceptions.LookupError` for an unknown label.
  72. """
  73. if hasattr(encoding_or_label, 'codec_info'):
  74. return encoding_or_label
  75. encoding = lookup(encoding_or_label)
  76. if encoding is None:
  77. raise LookupError('Unknown encoding label: %r' % encoding_or_label)
  78. return encoding
  79. class Encoding(object):
  80. """Reresents a character encoding such as UTF-8,
  81. that can be used for decoding or encoding.
  82. .. attribute:: name
  83. Canonical name of the encoding
  84. .. attribute:: codec_info
  85. The actual implementation of the encoding,
  86. a stdlib :class:`~codecs.CodecInfo` object.
  87. See :func:`codecs.register`.
  88. """
  89. def __init__(self, name, codec_info):
  90. self.name = name
  91. self.codec_info = codec_info
  92. def __repr__(self):
  93. return '<Encoding %s>' % self.name
#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')

# Private handles for the two UTF-16 byte orders. _detect_bom() returns them
# when the input starts with the matching byte order mark.
_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')
  98. def decode(input, fallback_encoding, errors='replace'):
  99. """
  100. Decode a single string.
  101. :param input: A byte string
  102. :param fallback_encoding:
  103. An :class:`Encoding` object or a label string.
  104. The encoding to use if :obj:`input` does note have a BOM.
  105. :param errors: Type of error handling. See :func:`codecs.register`.
  106. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
  107. :return:
  108. A ``(output, encoding)`` tuple of an Unicode string
  109. and an :obj:`Encoding`.
  110. """
  111. # Fail early if `encoding` is an invalid label.
  112. fallback_encoding = _get_encoding(fallback_encoding)
  113. bom_encoding, input = _detect_bom(input)
  114. encoding = bom_encoding or fallback_encoding
  115. return encoding.codec_info.decode(input, errors)[0], encoding
  116. def _detect_bom(input):
  117. """Return (bom_encoding, input), with any BOM removed from the input."""
  118. if input.startswith(b'\xFF\xFE'):
  119. return _UTF16LE, input[2:]
  120. if input.startswith(b'\xFE\xFF'):
  121. return _UTF16BE, input[2:]
  122. if input.startswith(b'\xEF\xBB\xBF'):
  123. return UTF8, input[3:]
  124. return None, input
  125. def encode(input, encoding=UTF8, errors='strict'):
  126. """
  127. Encode a single string.
  128. :param input: An Unicode string.
  129. :param encoding: An :class:`Encoding` object or a label string.
  130. :param errors: Type of error handling. See :func:`codecs.register`.
  131. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
  132. :return: A byte string.
  133. """
  134. return _get_encoding(encoding).codec_info.encode(input, errors)[0]
  135. def iter_decode(input, fallback_encoding, errors='replace'):
  136. """
  137. "Pull"-based decoder.
  138. :param input:
  139. An iterable of byte strings.
  140. The input is first consumed just enough to determine the encoding
  141. based on the precense of a BOM,
  142. then consumed on demand when the return value is.
  143. :param fallback_encoding:
  144. An :class:`Encoding` object or a label string.
  145. The encoding to use if :obj:`input` does note have a BOM.
  146. :param errors: Type of error handling. See :func:`codecs.register`.
  147. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
  148. :returns:
  149. An ``(output, encoding)`` tuple.
  150. :obj:`output` is an iterable of Unicode strings,
  151. :obj:`encoding` is the :obj:`Encoding` that is being used.
  152. """
  153. decoder = IncrementalDecoder(fallback_encoding, errors)
  154. generator = _iter_decode_generator(input, decoder)
  155. encoding = next(generator)
  156. return generator, encoding
  157. def _iter_decode_generator(input, decoder):
  158. """Return a generator that first yields the :obj:`Encoding`,
  159. then yields output chukns as Unicode strings.
  160. """
  161. decode = decoder.decode
  162. input = iter(input)
  163. for chunck in input:
  164. output = decode(chunck)
  165. if output:
  166. assert decoder.encoding is not None
  167. yield decoder.encoding
  168. yield output
  169. break
  170. else:
  171. # Input exhausted without determining the encoding
  172. output = decode(b'', final=True)
  173. assert decoder.encoding is not None
  174. yield decoder.encoding
  175. if output:
  176. yield output
  177. return
  178. for chunck in input:
  179. output = decode(chunck)
  180. if output:
  181. yield output
  182. output = decode(b'', final=True)
  183. if output:
  184. yield output
  185. def iter_encode(input, encoding=UTF8, errors='strict'):
  186. """
  187. “Pull”-based encoder.
  188. :param input: An iterable of Unicode strings.
  189. :param encoding: An :class:`Encoding` object or a label string.
  190. :param errors: Type of error handling. See :func:`codecs.register`.
  191. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
  192. :returns: An iterable of byte strings.
  193. """
  194. # Fail early if `encoding` is an invalid label.
  195. encode = IncrementalEncoder(encoding, errors).encode
  196. return _iter_encode_generator(input, encode)
  197. def _iter_encode_generator(input, encode):
  198. for chunck in input:
  199. output = encode(chunck)
  200. if output:
  201. yield output
  202. output = encode('', final=True)
  203. if output:
  204. yield output
  205. class IncrementalDecoder(object):
  206. """
  207. “Push”-based decoder.
  208. :param fallback_encoding:
  209. An :class:`Encoding` object or a label string.
  210. The encoding to use if :obj:`input` does note have a BOM.
  211. :param errors: Type of error handling. See :func:`codecs.register`.
  212. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
  213. """
  214. def __init__(self, fallback_encoding, errors='replace'):
  215. # Fail early if `encoding` is an invalid label.
  216. self._fallback_encoding = _get_encoding(fallback_encoding)
  217. self._errors = errors
  218. self._buffer = b''
  219. self._decoder = None
  220. #: The actual :class:`Encoding` that is being used,
  221. #: or :obj:`None` if that is not determined yet.
  222. #: (Ie. if there is not enough input yet to determine
  223. #: if there is a BOM.)
  224. self.encoding = None # Not known yet.
  225. def decode(self, input, final=False):
  226. """Decode one chunk of the input.
  227. :param input: A byte string.
  228. :param final:
  229. Indicate that no more input is available.
  230. Must be :obj:`True` if this is the last call.
  231. :returns: An Unicode string.
  232. """
  233. decoder = self._decoder
  234. if decoder is not None:
  235. return decoder(input, final)
  236. input = self._buffer + input
  237. encoding, input = _detect_bom(input)
  238. if encoding is None:
  239. if len(input) < 3 and not final: # Not enough data yet.
  240. self._buffer = input
  241. return ''
  242. else: # No BOM
  243. encoding = self._fallback_encoding
  244. decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
  245. self._decoder = decoder
  246. self.encoding = encoding
  247. return decoder(input, final)
  248. class IncrementalEncoder(object):
  249. """
  250. “Push”-based encoder.
  251. :param encoding: An :class:`Encoding` object or a label string.
  252. :param errors: Type of error handling. See :func:`codecs.register`.
  253. :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
  254. .. method:: encode(input, final=False)
  255. :param input: An Unicode string.
  256. :param final:
  257. Indicate that no more input is available.
  258. Must be :obj:`True` if this is the last call.
  259. :returns: A byte string.
  260. """
  261. def __init__(self, encoding=UTF8, errors='strict'):
  262. encoding = _get_encoding(encoding)
  263. self.encode = encoding.codec_info.incrementalencoder(errors).encode