# _inputstream.py -- HTML input-stream handling for html5lib (vendored in pip).
from __future__ import absolute_import, division, unicode_literals

import codecs
import re
from io import BytesIO, StringIO

from pip._vendor import webencodings
from pip._vendor.six import text_type
from pip._vendor.six.moves import http_client, urllib

from . import _utils
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import _ReparseException
  11. # Non-unicode versions of constants for use in the pre-parser
  12. spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
  13. asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
  14. asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
  15. spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
  16. invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
  17. if _utils.supports_lone_surrogates:
  18. # Use one extra step of indirection and create surrogates with
  19. # eval. Not using this indirection would introduce an illegal
  20. # unicode literal on platforms not supporting such lone
  21. # surrogates.
  22. assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
  23. invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
  24. eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
  25. "]")
  26. else:
  27. invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
  28. non_bmp_invalid_codepoints = {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
  29. 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
  30. 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
  31. 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
  32. 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
  33. 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
  34. 0x10FFFE, 0x10FFFF}
  35. ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]")
  36. # Cache for charsUntil()
  37. charsUntilRegEx = {}
  38. class BufferedStream(object):
  39. """Buffering for streams that do not have buffering of their own
  40. The buffer is implemented as a list of chunks on the assumption that
  41. joining many strings will be slow since it is O(n**2)
  42. """
  43. def __init__(self, stream):
  44. self.stream = stream
  45. self.buffer = []
  46. self.position = [-1, 0] # chunk number, offset
  47. def tell(self):
  48. pos = 0
  49. for chunk in self.buffer[:self.position[0]]:
  50. pos += len(chunk)
  51. pos += self.position[1]
  52. return pos
  53. def seek(self, pos):
  54. assert pos <= self._bufferedBytes()
  55. offset = pos
  56. i = 0
  57. while len(self.buffer[i]) < offset:
  58. offset -= len(self.buffer[i])
  59. i += 1
  60. self.position = [i, offset]
  61. def read(self, bytes):
  62. if not self.buffer:
  63. return self._readStream(bytes)
  64. elif (self.position[0] == len(self.buffer) and
  65. self.position[1] == len(self.buffer[-1])):
  66. return self._readStream(bytes)
  67. else:
  68. return self._readFromBuffer(bytes)
  69. def _bufferedBytes(self):
  70. return sum([len(item) for item in self.buffer])
  71. def _readStream(self, bytes):
  72. data = self.stream.read(bytes)
  73. self.buffer.append(data)
  74. self.position[0] += 1
  75. self.position[1] = len(data)
  76. return data
  77. def _readFromBuffer(self, bytes):
  78. remainingBytes = bytes
  79. rv = []
  80. bufferIndex = self.position[0]
  81. bufferOffset = self.position[1]
  82. while bufferIndex < len(self.buffer) and remainingBytes != 0:
  83. assert remainingBytes > 0
  84. bufferedData = self.buffer[bufferIndex]
  85. if remainingBytes <= len(bufferedData) - bufferOffset:
  86. bytesToRead = remainingBytes
  87. self.position = [bufferIndex, bufferOffset + bytesToRead]
  88. else:
  89. bytesToRead = len(bufferedData) - bufferOffset
  90. self.position = [bufferIndex, len(bufferedData)]
  91. bufferIndex += 1
  92. rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
  93. remainingBytes -= bytesToRead
  94. bufferOffset = 0
  95. if remainingBytes:
  96. rv.append(self._readStream(remainingBytes))
  97. return b"".join(rv)
  98. def HTMLInputStream(source, **kwargs):
  99. # Work around Python bug #20007: read(0) closes the connection.
  100. # http://bugs.python.org/issue20007
  101. if (isinstance(source, http_client.HTTPResponse) or
  102. # Also check for addinfourl wrapping HTTPResponse
  103. (isinstance(source, urllib.response.addbase) and
  104. isinstance(source.fp, http_client.HTTPResponse))):
  105. isUnicode = False
  106. elif hasattr(source, "read"):
  107. isUnicode = isinstance(source.read(0), text_type)
  108. else:
  109. isUnicode = isinstance(source, text_type)
  110. if isUnicode:
  111. encodings = [x for x in kwargs if x.endswith("_encoding")]
  112. if encodings:
  113. raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
  114. return HTMLUnicodeInputStream(source, **kwargs)
  115. else:
  116. return HTMLBinaryInputStream(source, **kwargs)
  117. class HTMLUnicodeInputStream(object):
  118. """Provides a unicode stream of characters to the HTMLTokenizer.
  119. This class takes care of character encoding and removing or replacing
  120. incorrect byte-sequences and also provides column and line tracking.
  121. """
  122. _defaultChunkSize = 10240
  123. def __init__(self, source):
  124. """Initialises the HTMLInputStream.
  125. HTMLInputStream(source, [encoding]) -> Normalized stream from source
  126. for use by html5lib.
  127. source can be either a file-object, local filename or a string.
  128. The optional encoding parameter must be a string that indicates
  129. the encoding. If specified, that encoding will be used,
  130. regardless of any BOM or later declaration (such as in a meta
  131. element)
  132. """
  133. if not _utils.supports_lone_surrogates:
  134. # Such platforms will have already checked for such
  135. # surrogate errors, so no need to do this checking.
  136. self.reportCharacterErrors = None
  137. elif len("\U0010FFFF") == 1:
  138. self.reportCharacterErrors = self.characterErrorsUCS4
  139. else:
  140. self.reportCharacterErrors = self.characterErrorsUCS2
  141. # List of where new lines occur
  142. self.newLines = [0]
  143. self.charEncoding = (lookupEncoding("utf-8"), "certain")
  144. self.dataStream = self.openStream(source)
  145. self.reset()
  146. def reset(self):
  147. self.chunk = ""
  148. self.chunkSize = 0
  149. self.chunkOffset = 0
  150. self.errors = []
  151. # number of (complete) lines in previous chunks
  152. self.prevNumLines = 0
  153. # number of columns in the last line of the previous chunk
  154. self.prevNumCols = 0
  155. # Deal with CR LF and surrogates split over chunk boundaries
  156. self._bufferedCharacter = None
  157. def openStream(self, source):
  158. """Produces a file object from source.
  159. source can be either a file object, local filename or a string.
  160. """
  161. # Already a file object
  162. if hasattr(source, 'read'):
  163. stream = source
  164. else:
  165. stream = StringIO(source)
  166. return stream
  167. def _position(self, offset):
  168. chunk = self.chunk
  169. nLines = chunk.count('\n', 0, offset)
  170. positionLine = self.prevNumLines + nLines
  171. lastLinePos = chunk.rfind('\n', 0, offset)
  172. if lastLinePos == -1:
  173. positionColumn = self.prevNumCols + offset
  174. else:
  175. positionColumn = offset - (lastLinePos + 1)
  176. return (positionLine, positionColumn)
  177. def position(self):
  178. """Returns (line, col) of the current position in the stream."""
  179. line, col = self._position(self.chunkOffset)
  180. return (line + 1, col)
  181. def char(self):
  182. """ Read one character from the stream or queue if available. Return
  183. EOF when EOF is reached.
  184. """
  185. # Read a new chunk from the input stream if necessary
  186. if self.chunkOffset >= self.chunkSize:
  187. if not self.readChunk():
  188. return EOF
  189. chunkOffset = self.chunkOffset
  190. char = self.chunk[chunkOffset]
  191. self.chunkOffset = chunkOffset + 1
  192. return char
  193. def readChunk(self, chunkSize=None):
  194. if chunkSize is None:
  195. chunkSize = self._defaultChunkSize
  196. self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
  197. self.chunk = ""
  198. self.chunkSize = 0
  199. self.chunkOffset = 0
  200. data = self.dataStream.read(chunkSize)
  201. # Deal with CR LF and surrogates broken across chunks
  202. if self._bufferedCharacter:
  203. data = self._bufferedCharacter + data
  204. self._bufferedCharacter = None
  205. elif not data:
  206. # We have no more data, bye-bye stream
  207. return False
  208. if len(data) > 1:
  209. lastv = ord(data[-1])
  210. if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
  211. self._bufferedCharacter = data[-1]
  212. data = data[:-1]
  213. if self.reportCharacterErrors:
  214. self.reportCharacterErrors(data)
  215. # Replace invalid characters
  216. data = data.replace("\r\n", "\n")
  217. data = data.replace("\r", "\n")
  218. self.chunk = data
  219. self.chunkSize = len(data)
  220. return True
  221. def characterErrorsUCS4(self, data):
  222. for _ in range(len(invalid_unicode_re.findall(data))):
  223. self.errors.append("invalid-codepoint")
  224. def characterErrorsUCS2(self, data):
  225. # Someone picked the wrong compile option
  226. # You lose
  227. skip = False
  228. for match in invalid_unicode_re.finditer(data):
  229. if skip:
  230. continue
  231. codepoint = ord(match.group())
  232. pos = match.start()
  233. # Pretty sure there should be endianness issues here
  234. if _utils.isSurrogatePair(data[pos:pos + 2]):
  235. # We have a surrogate pair!
  236. char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
  237. if char_val in non_bmp_invalid_codepoints:
  238. self.errors.append("invalid-codepoint")
  239. skip = True
  240. elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
  241. pos == len(data) - 1):
  242. self.errors.append("invalid-codepoint")
  243. else:
  244. skip = False
  245. self.errors.append("invalid-codepoint")
  246. def charsUntil(self, characters, opposite=False):
  247. """ Returns a string of characters from the stream up to but not
  248. including any character in 'characters' or EOF. 'characters' must be
  249. a container that supports the 'in' method and iteration over its
  250. characters.
  251. """
  252. # Use a cache of regexps to find the required characters
  253. try:
  254. chars = charsUntilRegEx[(characters, opposite)]
  255. except KeyError:
  256. if __debug__:
  257. for c in characters:
  258. assert(ord(c) < 128)
  259. regex = "".join(["\\x%02x" % ord(c) for c in characters])
  260. if not opposite:
  261. regex = "^%s" % regex
  262. chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
  263. rv = []
  264. while True:
  265. # Find the longest matching prefix
  266. m = chars.match(self.chunk, self.chunkOffset)
  267. if m is None:
  268. # If nothing matched, and it wasn't because we ran out of chunk,
  269. # then stop
  270. if self.chunkOffset != self.chunkSize:
  271. break
  272. else:
  273. end = m.end()
  274. # If not the whole chunk matched, return everything
  275. # up to the part that didn't match
  276. if end != self.chunkSize:
  277. rv.append(self.chunk[self.chunkOffset:end])
  278. self.chunkOffset = end
  279. break
  280. # If the whole remainder of the chunk matched,
  281. # use it all and read the next chunk
  282. rv.append(self.chunk[self.chunkOffset:])
  283. if not self.readChunk():
  284. # Reached EOF
  285. break
  286. r = "".join(rv)
  287. return r
  288. def unget(self, char):
  289. # Only one character is allowed to be ungotten at once - it must
  290. # be consumed again before any further call to unget
  291. if char is not EOF:
  292. if self.chunkOffset == 0:
  293. # unget is called quite rarely, so it's a good idea to do
  294. # more work here if it saves a bit of work in the frequently
  295. # called char and charsUntil.
  296. # So, just prepend the ungotten character onto the current
  297. # chunk:
  298. self.chunk = char + self.chunk
  299. self.chunkSize += 1
  300. else:
  301. self.chunkOffset -= 1
  302. assert self.chunk[self.chunkOffset] == char
  303. class HTMLBinaryInputStream(HTMLUnicodeInputStream):
  304. """Provides a unicode stream of characters to the HTMLTokenizer.
  305. This class takes care of character encoding and removing or replacing
  306. incorrect byte-sequences and also provides column and line tracking.
  307. """
  308. def __init__(self, source, override_encoding=None, transport_encoding=None,
  309. same_origin_parent_encoding=None, likely_encoding=None,
  310. default_encoding="windows-1252", useChardet=True):
  311. """Initialises the HTMLInputStream.
  312. HTMLInputStream(source, [encoding]) -> Normalized stream from source
  313. for use by html5lib.
  314. source can be either a file-object, local filename or a string.
  315. The optional encoding parameter must be a string that indicates
  316. the encoding. If specified, that encoding will be used,
  317. regardless of any BOM or later declaration (such as in a meta
  318. element)
  319. """
  320. # Raw Stream - for unicode objects this will encode to utf-8 and set
  321. # self.charEncoding as appropriate
  322. self.rawStream = self.openStream(source)
  323. HTMLUnicodeInputStream.__init__(self, self.rawStream)
  324. # Encoding Information
  325. # Number of bytes to use when looking for a meta element with
  326. # encoding information
  327. self.numBytesMeta = 1024
  328. # Number of bytes to use when using detecting encoding using chardet
  329. self.numBytesChardet = 100
  330. # Things from args
  331. self.override_encoding = override_encoding
  332. self.transport_encoding = transport_encoding
  333. self.same_origin_parent_encoding = same_origin_parent_encoding
  334. self.likely_encoding = likely_encoding
  335. self.default_encoding = default_encoding
  336. # Determine encoding
  337. self.charEncoding = self.determineEncoding(useChardet)
  338. assert self.charEncoding[0] is not None
  339. # Call superclass
  340. self.reset()
  341. def reset(self):
  342. self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
  343. HTMLUnicodeInputStream.reset(self)
  344. def openStream(self, source):
  345. """Produces a file object from source.
  346. source can be either a file object, local filename or a string.
  347. """
  348. # Already a file object
  349. if hasattr(source, 'read'):
  350. stream = source
  351. else:
  352. stream = BytesIO(source)
  353. try:
  354. stream.seek(stream.tell())
  355. except Exception:
  356. stream = BufferedStream(stream)
  357. return stream
  358. def determineEncoding(self, chardet=True):
  359. # BOMs take precedence over everything
  360. # This will also read past the BOM if present
  361. charEncoding = self.detectBOM(), "certain"
  362. if charEncoding[0] is not None:
  363. return charEncoding
  364. # If we've been overridden, we've been overridden
  365. charEncoding = lookupEncoding(self.override_encoding), "certain"
  366. if charEncoding[0] is not None:
  367. return charEncoding
  368. # Now check the transport layer
  369. charEncoding = lookupEncoding(self.transport_encoding), "certain"
  370. if charEncoding[0] is not None:
  371. return charEncoding
  372. # Look for meta elements with encoding information
  373. charEncoding = self.detectEncodingMeta(), "tentative"
  374. if charEncoding[0] is not None:
  375. return charEncoding
  376. # Parent document encoding
  377. charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
  378. if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
  379. return charEncoding
  380. # "likely" encoding
  381. charEncoding = lookupEncoding(self.likely_encoding), "tentative"
  382. if charEncoding[0] is not None:
  383. return charEncoding
  384. # Guess with chardet, if available
  385. if chardet:
  386. try:
  387. from pip._vendor.chardet.universaldetector import UniversalDetector
  388. except ImportError:
  389. pass
  390. else:
  391. buffers = []
  392. detector = UniversalDetector()
  393. while not detector.done:
  394. buffer = self.rawStream.read(self.numBytesChardet)
  395. assert isinstance(buffer, bytes)
  396. if not buffer:
  397. break
  398. buffers.append(buffer)
  399. detector.feed(buffer)
  400. detector.close()
  401. encoding = lookupEncoding(detector.result['encoding'])
  402. self.rawStream.seek(0)
  403. if encoding is not None:
  404. return encoding, "tentative"
  405. # Try the default encoding
  406. charEncoding = lookupEncoding(self.default_encoding), "tentative"
  407. if charEncoding[0] is not None:
  408. return charEncoding
  409. # Fallback to html5lib's default if even that hasn't worked
  410. return lookupEncoding("windows-1252"), "tentative"
  411. def changeEncoding(self, newEncoding):
  412. assert self.charEncoding[1] != "certain"
  413. newEncoding = lookupEncoding(newEncoding)
  414. if newEncoding is None:
  415. return
  416. if newEncoding.name in ("utf-16be", "utf-16le"):
  417. newEncoding = lookupEncoding("utf-8")
  418. assert newEncoding is not None
  419. elif newEncoding == self.charEncoding[0]:
  420. self.charEncoding = (self.charEncoding[0], "certain")
  421. else:
  422. self.rawStream.seek(0)
  423. self.charEncoding = (newEncoding, "certain")
  424. self.reset()
  425. raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
  426. def detectBOM(self):
  427. """Attempts to detect at BOM at the start of the stream. If
  428. an encoding can be determined from the BOM return the name of the
  429. encoding otherwise return None"""
  430. bomDict = {
  431. codecs.BOM_UTF8: 'utf-8',
  432. codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
  433. codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
  434. }
  435. # Go to beginning of file and read in 4 bytes
  436. string = self.rawStream.read(4)
  437. assert isinstance(string, bytes)
  438. # Try detecting the BOM using bytes from the string
  439. encoding = bomDict.get(string[:3]) # UTF-8
  440. seek = 3
  441. if not encoding:
  442. # Need to detect UTF-32 before UTF-16
  443. encoding = bomDict.get(string) # UTF-32
  444. seek = 4
  445. if not encoding:
  446. encoding = bomDict.get(string[:2]) # UTF-16
  447. seek = 2
  448. # Set the read position past the BOM if one was found, otherwise
  449. # set it to the start of the stream
  450. if encoding:
  451. self.rawStream.seek(seek)
  452. return lookupEncoding(encoding)
  453. else:
  454. self.rawStream.seek(0)
  455. return None
  456. def detectEncodingMeta(self):
  457. """Report the encoding declared by the meta element
  458. """
  459. buffer = self.rawStream.read(self.numBytesMeta)
  460. assert isinstance(buffer, bytes)
  461. parser = EncodingParser(buffer)
  462. self.rawStream.seek(0)
  463. encoding = parser.getEncoding()
  464. if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
  465. encoding = lookupEncoding("utf-8")
  466. return encoding
  467. class EncodingBytes(bytes):
  468. """String-like object with an associated position and various extra methods
  469. If the position is ever greater than the string length then an exception is
  470. raised"""
  471. def __new__(self, value):
  472. assert isinstance(value, bytes)
  473. return bytes.__new__(self, value.lower())
  474. def __init__(self, value):
  475. # pylint:disable=unused-argument
  476. self._position = -1
  477. def __iter__(self):
  478. return self
  479. def __next__(self):
  480. p = self._position = self._position + 1
  481. if p >= len(self):
  482. raise StopIteration
  483. elif p < 0:
  484. raise TypeError
  485. return self[p:p + 1]
  486. def next(self):
  487. # Py2 compat
  488. return self.__next__()
  489. def previous(self):
  490. p = self._position
  491. if p >= len(self):
  492. raise StopIteration
  493. elif p < 0:
  494. raise TypeError
  495. self._position = p = p - 1
  496. return self[p:p + 1]
  497. def setPosition(self, position):
  498. if self._position >= len(self):
  499. raise StopIteration
  500. self._position = position
  501. def getPosition(self):
  502. if self._position >= len(self):
  503. raise StopIteration
  504. if self._position >= 0:
  505. return self._position
  506. else:
  507. return None
  508. position = property(getPosition, setPosition)
  509. def getCurrentByte(self):
  510. return self[self.position:self.position + 1]
  511. currentByte = property(getCurrentByte)
  512. def skip(self, chars=spaceCharactersBytes):
  513. """Skip past a list of characters"""
  514. p = self.position # use property for the error-checking
  515. while p < len(self):
  516. c = self[p:p + 1]
  517. if c not in chars:
  518. self._position = p
  519. return c
  520. p += 1
  521. self._position = p
  522. return None
  523. def skipUntil(self, chars):
  524. p = self.position
  525. while p < len(self):
  526. c = self[p:p + 1]
  527. if c in chars:
  528. self._position = p
  529. return c
  530. p += 1
  531. self._position = p
  532. return None
  533. def matchBytes(self, bytes):
  534. """Look for a sequence of bytes at the start of a string. If the bytes
  535. are found return True and advance the position to the byte after the
  536. match. Otherwise return False and leave the position alone"""
  537. rv = self.startswith(bytes, self.position)
  538. if rv:
  539. self.position += len(bytes)
  540. return rv
  541. def jumpTo(self, bytes):
  542. """Look for the next sequence of bytes matching a given sequence. If
  543. a match is found advance the position to the last byte of the match"""
  544. try:
  545. self._position = self.index(bytes, self.position) + len(bytes) - 1
  546. except ValueError:
  547. raise StopIteration
  548. return True
  549. class EncodingParser(object):
  550. """Mini parser for detecting character encoding from meta elements"""
  551. def __init__(self, data):
  552. """string - the data to work on for encoding detection"""
  553. self.data = EncodingBytes(data)
  554. self.encoding = None
  555. def getEncoding(self):
  556. if b"<meta" not in self.data:
  557. return None
  558. methodDispatch = (
  559. (b"<!--", self.handleComment),
  560. (b"<meta", self.handleMeta),
  561. (b"</", self.handlePossibleEndTag),
  562. (b"<!", self.handleOther),
  563. (b"<?", self.handleOther),
  564. (b"<", self.handlePossibleStartTag))
  565. for _ in self.data:
  566. keepParsing = True
  567. try:
  568. self.data.jumpTo(b"<")
  569. except StopIteration:
  570. break
  571. for key, method in methodDispatch:
  572. if self.data.matchBytes(key):
  573. try:
  574. keepParsing = method()
  575. break
  576. except StopIteration:
  577. keepParsing = False
  578. break
  579. if not keepParsing:
  580. break
  581. return self.encoding
  582. def handleComment(self):
  583. """Skip over comments"""
  584. return self.data.jumpTo(b"-->")
  585. def handleMeta(self):
  586. if self.data.currentByte not in spaceCharactersBytes:
  587. # if we have <meta not followed by a space so just keep going
  588. return True
  589. # We have a valid meta element we want to search for attributes
  590. hasPragma = False
  591. pendingEncoding = None
  592. while True:
  593. # Try to find the next attribute after the current position
  594. attr = self.getAttribute()
  595. if attr is None:
  596. return True
  597. else:
  598. if attr[0] == b"http-equiv":
  599. hasPragma = attr[1] == b"content-type"
  600. if hasPragma and pendingEncoding is not None:
  601. self.encoding = pendingEncoding
  602. return False
  603. elif attr[0] == b"charset":
  604. tentativeEncoding = attr[1]
  605. codec = lookupEncoding(tentativeEncoding)
  606. if codec is not None:
  607. self.encoding = codec
  608. return False
  609. elif attr[0] == b"content":
  610. contentParser = ContentAttrParser(EncodingBytes(attr[1]))
  611. tentativeEncoding = contentParser.parse()
  612. if tentativeEncoding is not None:
  613. codec = lookupEncoding(tentativeEncoding)
  614. if codec is not None:
  615. if hasPragma:
  616. self.encoding = codec
  617. return False
  618. else:
  619. pendingEncoding = codec
  620. def handlePossibleStartTag(self):
  621. return self.handlePossibleTag(False)
  622. def handlePossibleEndTag(self):
  623. next(self.data)
  624. return self.handlePossibleTag(True)
  625. def handlePossibleTag(self, endTag):
  626. data = self.data
  627. if data.currentByte not in asciiLettersBytes:
  628. # If the next byte is not an ascii letter either ignore this
  629. # fragment (possible start tag case) or treat it according to
  630. # handleOther
  631. if endTag:
  632. data.previous()
  633. self.handleOther()
  634. return True
  635. c = data.skipUntil(spacesAngleBrackets)
  636. if c == b"<":
  637. # return to the first step in the overall "two step" algorithm
  638. # reprocessing the < byte
  639. data.previous()
  640. else:
  641. # Read all attributes
  642. attr = self.getAttribute()
  643. while attr is not None:
  644. attr = self.getAttribute()
  645. return True
  646. def handleOther(self):
  647. return self.data.jumpTo(b">")
  648. def getAttribute(self):
  649. """Return a name,value pair for the next attribute in the stream,
  650. if one is found, or None"""
  651. data = self.data
  652. # Step 1 (skip chars)
  653. c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
  654. assert c is None or len(c) == 1
  655. # Step 2
  656. if c in (b">", None):
  657. return None
  658. # Step 3
  659. attrName = []
  660. attrValue = []
  661. # Step 4 attribute name
  662. while True:
  663. if c == b"=" and attrName:
  664. break
  665. elif c in spaceCharactersBytes:
  666. # Step 6!
  667. c = data.skip()
  668. break
  669. elif c in (b"/", b">"):
  670. return b"".join(attrName), b""
  671. elif c in asciiUppercaseBytes:
  672. attrName.append(c.lower())
  673. elif c is None:
  674. return None
  675. else:
  676. attrName.append(c)
  677. # Step 5
  678. c = next(data)
  679. # Step 7
  680. if c != b"=":
  681. data.previous()
  682. return b"".join(attrName), b""
  683. # Step 8
  684. next(data)
  685. # Step 9
  686. c = data.skip()
  687. # Step 10
  688. if c in (b"'", b'"'):
  689. # 10.1
  690. quoteChar = c
  691. while True:
  692. # 10.2
  693. c = next(data)
  694. # 10.3
  695. if c == quoteChar:
  696. next(data)
  697. return b"".join(attrName), b"".join(attrValue)
  698. # 10.4
  699. elif c in asciiUppercaseBytes:
  700. attrValue.append(c.lower())
  701. # 10.5
  702. else:
  703. attrValue.append(c)
  704. elif c == b">":
  705. return b"".join(attrName), b""
  706. elif c in asciiUppercaseBytes:
  707. attrValue.append(c.lower())
  708. elif c is None:
  709. return None
  710. else:
  711. attrValue.append(c)
  712. # Step 11
  713. while True:
  714. c = next(data)
  715. if c in spacesAngleBrackets:
  716. return b"".join(attrName), b"".join(attrValue)
  717. elif c in asciiUppercaseBytes:
  718. attrValue.append(c.lower())
  719. elif c is None:
  720. return None
  721. else:
  722. attrValue.append(c)
  723. class ContentAttrParser(object):
  724. def __init__(self, data):
  725. assert isinstance(data, bytes)
  726. self.data = data
  727. def parse(self):
  728. try:
  729. # Check if the attr name is charset
  730. # otherwise return
  731. self.data.jumpTo(b"charset")
  732. self.data.position += 1
  733. self.data.skip()
  734. if not self.data.currentByte == b"=":
  735. # If there is no = sign keep looking for attrs
  736. return None
  737. self.data.position += 1
  738. self.data.skip()
  739. # Look for an encoding between matching quote marks
  740. if self.data.currentByte in (b'"', b"'"):
  741. quoteMark = self.data.currentByte
  742. self.data.position += 1
  743. oldPosition = self.data.position
  744. if self.data.jumpTo(quoteMark):
  745. return self.data[oldPosition:self.data.position]
  746. else:
  747. return None
  748. else:
  749. # Unquoted value
  750. oldPosition = self.data.position
  751. try:
  752. self.data.skipUntil(spaceCharactersBytes)
  753. return self.data[oldPosition:self.data.position]
  754. except StopIteration:
  755. # Return the whole remaining value
  756. return self.data[oldPosition:]
  757. except StopIteration:
  758. return None
  759. def lookupEncoding(encoding):
  760. """Return the python codec name corresponding to an encoding or None if the
  761. string doesn't correspond to a valid encoding."""
  762. if isinstance(encoding, bytes):
  763. try:
  764. encoding = encoding.decode("ascii")
  765. except UnicodeDecodeError:
  766. return None
  767. if encoding is not None:
  768. try:
  769. return webencodings.lookup(encoding)
  770. except AttributeError:
  771. return None
  772. else:
  773. return None