tests.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. # coding: utf-8
  2. """
  3. webencodings.tests
  4. ~~~~~~~~~~~~~~~~~~
  5. A basic test suite for Encoding.
  6. :copyright: Copyright 2012 by Simon Sapin
  7. :license: BSD, see LICENSE for details.
  8. """
  9. from __future__ import unicode_literals
  10. from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
  11. IncrementalDecoder, IncrementalEncoder, UTF8)
  def assert_raises(exception, function, *args, **kwargs):
      """Assert that ``function(*args, **kwargs)`` raises ``exception``.

      ``exception`` is an exception class or a tuple of exception classes,
      exactly as accepted by an ``except`` clause.  An exception of any
      other type raised by the call propagates unchanged.

      Returns None when the expected exception was raised.
      Raises AssertionError when the call returns without raising.
      """
      try:
          function(*args, **kwargs)
      except exception:
          return
      else:  # pragma: no cover
          # Wrap in a 1-tuple: with a bare tuple of exception classes the
          # ``%`` operator would unpack it as several format arguments and
          # fail with TypeError instead of reporting the real problem.
          raise AssertionError('Did not raise %s.' % (exception,))
  def test_labels():
      """Check how ``lookup`` normalises labels.

      The assertions expect ASCII-only case folding and ASCII-only
      whitespace stripping; labels that are not recognised yield None.
      """
      # Case variants of the same label.
      assert lookup('utf-8').name == 'utf-8'
      assert lookup('Utf-8').name == 'utf-8'
      assert lookup('UTF-8').name == 'utf-8'
      assert lookup('utf8').name == 'utf-8'
      # Leading and trailing ASCII whitespace is ignored.
      assert lookup('utf8 ').name == 'utf-8'
      assert lookup(' \r\nutf8\t').name == 'utf-8'
      assert lookup('u8') is None  # Python label.
      # Non-ASCII white space (U+00A0 NO-BREAK SPACE) is not stripped.  It is
      # spelled as an escape so it cannot be confused with a plain space.
      assert lookup('utf-8\u00a0') is None
      # Several legacy labels all resolve to windows-1252.
      assert lookup('US-ASCII').name == 'windows-1252'
      assert lookup('iso-8859-1').name == 'windows-1252'
      assert lookup('latin1').name == 'windows-1252'
      assert lookup('LATIN1').name == 'windows-1252'
      assert lookup('latin-1') is None
      # ASCII-only case insensitivity: U+0130 (capital I with dot above)
      # must not be folded to an ASCII "i".
      assert lookup('LATİN1') is None
  def test_all_labels():
      """Run every label in LABELS through each entry point with empty input."""
      for label in LABELS:
          # One-shot API.
          expected_encoding = lookup(label)
          assert decode(b'', label) == ('', expected_encoding)
          assert encode('', label) == b''

          # Iterator API: any number of empty chunks produces no output.
          for chunk_count in (0, 1, 12):
              empty_bytes = [b''] * chunk_count
              empty_text = [''] * chunk_count
              decoded_chunks, _ = iter_decode(empty_bytes, label)
              assert list(decoded_chunks) == []
              assert list(iter_encode(empty_text, label)) == []

          # Incremental API, both for intermediate and for final calls.
          incremental_decoder = IncrementalDecoder(label)
          assert incremental_decoder.decode(b'') == ''
          assert incremental_decoder.decode(b'', final=True) == ''
          incremental_encoder = IncrementalEncoder(label)
          assert incremental_encoder.encode('') == b''
          assert incremental_encoder.encode('', final=True) == b''

      # All encoding names are valid labels too:
      canonical_names = set(LABELS.values())
      for canonical_name in canonical_names:
          assert lookup(canonical_name).name == canonical_name
  def test_invalid_label():
      """Every public entry point must raise LookupError for an unknown label."""
      # Functions that take (input, label).
      calls_with_input = (
          (decode, b'\xEF\xBB\xBF\xc3\xa9'),
          (encode, 'é'),
          (iter_decode, []),
          (iter_encode, []),
      )
      for function, payload in calls_with_input:
          assert_raises(LookupError, function, payload, 'invalid')

      # Classes constructed from the label alone.
      for incremental_class in (IncrementalDecoder, IncrementalEncoder):
          assert_raises(LookupError, incremental_class, 'invalid')
  def test_decode():
      """Check one-shot ``decode``: fallback encodings and BOM detection.

      ``decode`` returns a ``(text, encoding)`` pair.  The assertions expect
      the fallback encoding to be used when the input has no BOM, and a
      UTF-8 or UTF-16 BOM to take precedence over the fallback.
      """
      # The fallback may be given as a label or as an encoding object.
      assert decode(b'\x80', 'latin1') == ('€', lookup('latin1'))
      assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1'))
      assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8'))
      assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8'))
      # No BOM, so the single-byte fallback is used and the two UTF-8 bytes
      # become two characters (U+00C3, U+00A9), not a single U+00E9.
      assert decode(b'\xc3\xa9', 'ascii') == ('\u00c3\u00a9', lookup('ascii'))

      # UTF-8 with BOM
      assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == (
          'é', lookup('utf8'))
      # UTF-16-BE with BOM
      assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == (
          'é', lookup('utf-16be'))
      # UTF-16-LE with BOM
      assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == (
          'é', lookup('utf-16le'))
      # Same BOMs with the payload bytes swapped.
      assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == (
          '\ue900', lookup('utf-16be'))
      assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == (
          '\ue900', lookup('utf-16le'))

      # No BOM: the requested UTF-16 flavour is used as-is.  The plain
      # 'UTF-16' label is expected to behave as little-endian.
      assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be'))
      assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le'))
      assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le'))

      assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be'))
      assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le'))
      assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))
  def test_encode():
      """Check one-shot ``encode`` of a single character for several labels."""
      expectations = [
          ('latin1', b'\xe9'),
          ('utf8', b'\xc3\xa9'),
          ('utf8', b'\xc3\xa9'),
          ('utf-16', b'\xe9\x00'),
          ('utf-16le', b'\xe9\x00'),
          ('utf-16be', b'\x00\xe9'),
      ]
      for label, expected_bytes in expectations:
          assert encode('é', label) == expected_bytes
  def test_iter_decode():
      """Check ``iter_decode`` on chunked input, including BOMs split across chunks."""

      def iter_decode_to_string(chunks, fallback_encoding):
          """Decode ``chunks`` with ``iter_decode`` and join the output."""
          output, _encoding = iter_decode(chunks, fallback_encoding)
          return ''.join(output)

      # No BOM: the fallback encoding is used, however the input is chunked.
      assert iter_decode_to_string([], 'latin1') == ''
      assert iter_decode_to_string([b''], 'latin1') == ''
      assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é'
      assert iter_decode_to_string([b'hello'], 'latin1') == 'hello'
      assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello'
      assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello'
      # UTF-8 bytes without a BOM are decoded with the single-byte fallback,
      # giving two characters (U+00C3, U+00A9) rather than U+00E9.
      assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == '\u00c3\u00a9'

      # UTF-8 BOM, whole or split across chunks, overrides the fallback.
      assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é'
      assert iter_decode_to_string([
          b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é'
      # A truncated trailing UTF-8 sequence becomes U+FFFD.
      assert iter_decode_to_string([
          b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD'
      assert iter_decode_to_string([
          b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é'
      assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == ''
      # An incomplete BOM is ordinary data for the fallback encoding.
      assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»'

      # UTF-16 BOMs, whole or split across chunks.
      assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é'
      assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é'
      assert iter_decode_to_string([
          b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é'

      assert iter_decode_to_string([
          b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo'
  def test_iter_encode():
      """Check ``iter_encode`` on chunked text, including empty chunks."""
      spaced_chunks = ['', 'é', '', '']
      expectations = [
          ([], 'latin1', b''),
          ([''], 'latin1', b''),
          (['é'], 'latin1', b'\xe9'),
          (spaced_chunks, 'latin1', b'\xe9'),
          (spaced_chunks, 'utf-16', b'\xe9\x00'),
          (spaced_chunks, 'utf-16le', b'\xe9\x00'),
          (spaced_chunks, 'utf-16be', b'\x00\xe9'),
          (['', 'h\uF7E9', '', 'llo'], 'x-user-defined', b'h\xe9llo'),
      ]
      for text_chunks, label, expected_bytes in expectations:
          encoded_chunks = iter_encode(text_chunks, label)
          assert b''.join(encoded_chunks) == expected_bytes
  def test_x_user_defined():
      """Round-trip data through the ``x-user-defined`` encoding.

      Both samples are checked.  Previously the first pair was overwritten
      before any assertion ran, so the sample containing non-ASCII bytes
      was never tested.
      """
      samples = [
          # Bytes >= 0x80 are expected to map to U+F780..U+F7FF.
          (b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca',
           '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca'),
          # Pure ASCII passes through unchanged.
          (b'aa', 'aa'),
      ]
      for encoded, decoded in samples:
          assert decode(encoded, 'x-user-defined') == (
              decoded, lookup('x-user-defined'))
          assert encode(decoded, 'x-user-defined') == encoded