test_unicode.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362
  1. import numpy as np
  2. from numpy.testing import assert_, assert_equal, assert_array_equal
  3. def buffer_length(arr):
  4. if isinstance(arr, str):
  5. if not arr:
  6. charmax = 0
  7. else:
  8. charmax = max([ord(c) for c in arr])
  9. if charmax < 256:
  10. size = 1
  11. elif charmax < 65536:
  12. size = 2
  13. else:
  14. size = 4
  15. return size * len(arr)
  16. v = memoryview(arr)
  17. if v.shape is None:
  18. return len(v) * v.itemsize
  19. else:
  20. return np.prod(v.shape) * v.itemsize
  21. # In both cases below we need to make sure that the byte swapped value (as
  22. # UCS4) is still a valid unicode:
  23. # Value that can be represented in UCS2 interpreters
  24. ucs2_value = u'\u0900'
  25. # Value that cannot be represented in UCS2 interpreters (but can in UCS4)
  26. ucs4_value = u'\U00100900'
  27. def test_string_cast():
  28. str_arr = np.array(["1234", "1234\0\0"], dtype='S')
  29. uni_arr1 = str_arr.astype('>U')
  30. uni_arr2 = str_arr.astype('<U')
  31. assert_(str_arr != uni_arr1)
  32. assert_(str_arr != uni_arr2)
  33. assert_array_equal(uni_arr1, uni_arr2)
  34. ############################################################
  35. # Creation tests
  36. ############################################################
  37. class CreateZeros:
  38. """Check the creation of zero-valued arrays"""
  39. def content_check(self, ua, ua_scalar, nbytes):
  40. # Check the length of the unicode base type
  41. assert_(int(ua.dtype.str[2:]) == self.ulen)
  42. # Check the length of the data buffer
  43. assert_(buffer_length(ua) == nbytes)
  44. # Small check that data in array element is ok
  45. assert_(ua_scalar == u'')
  46. # Encode to ascii and double check
  47. assert_(ua_scalar.encode('ascii') == b'')
  48. # Check buffer lengths for scalars
  49. assert_(buffer_length(ua_scalar) == 0)
  50. def test_zeros0D(self):
  51. # Check creation of 0-dimensional objects
  52. ua = np.zeros((), dtype='U%s' % self.ulen)
  53. self.content_check(ua, ua[()], 4*self.ulen)
  54. def test_zerosSD(self):
  55. # Check creation of single-dimensional objects
  56. ua = np.zeros((2,), dtype='U%s' % self.ulen)
  57. self.content_check(ua, ua[0], 4*self.ulen*2)
  58. self.content_check(ua, ua[1], 4*self.ulen*2)
  59. def test_zerosMD(self):
  60. # Check creation of multi-dimensional objects
  61. ua = np.zeros((2, 3, 4), dtype='U%s' % self.ulen)
  62. self.content_check(ua, ua[0, 0, 0], 4*self.ulen*2*3*4)
  63. self.content_check(ua, ua[-1, -1, -1], 4*self.ulen*2*3*4)
  64. class TestCreateZeros_1(CreateZeros):
  65. """Check the creation of zero-valued arrays (size 1)"""
  66. ulen = 1
  67. class TestCreateZeros_2(CreateZeros):
  68. """Check the creation of zero-valued arrays (size 2)"""
  69. ulen = 2
  70. class TestCreateZeros_1009(CreateZeros):
  71. """Check the creation of zero-valued arrays (size 1009)"""
  72. ulen = 1009
  73. class CreateValues:
  74. """Check the creation of unicode arrays with values"""
  75. def content_check(self, ua, ua_scalar, nbytes):
  76. # Check the length of the unicode base type
  77. assert_(int(ua.dtype.str[2:]) == self.ulen)
  78. # Check the length of the data buffer
  79. assert_(buffer_length(ua) == nbytes)
  80. # Small check that data in array element is ok
  81. assert_(ua_scalar == self.ucs_value*self.ulen)
  82. # Encode to UTF-8 and double check
  83. assert_(ua_scalar.encode('utf-8') ==
  84. (self.ucs_value*self.ulen).encode('utf-8'))
  85. # Check buffer lengths for scalars
  86. if self.ucs_value == ucs4_value:
  87. # In UCS2, the \U0010FFFF will be represented using a
  88. # surrogate *pair*
  89. assert_(buffer_length(ua_scalar) == 2*2*self.ulen)
  90. else:
  91. # In UCS2, the \uFFFF will be represented using a
  92. # regular 2-byte word
  93. assert_(buffer_length(ua_scalar) == 2*self.ulen)
  94. def test_values0D(self):
  95. # Check creation of 0-dimensional objects with values
  96. ua = np.array(self.ucs_value*self.ulen, dtype='U%s' % self.ulen)
  97. self.content_check(ua, ua[()], 4*self.ulen)
  98. def test_valuesSD(self):
  99. # Check creation of single-dimensional objects with values
  100. ua = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
  101. self.content_check(ua, ua[0], 4*self.ulen*2)
  102. self.content_check(ua, ua[1], 4*self.ulen*2)
  103. def test_valuesMD(self):
  104. # Check creation of multi-dimensional objects with values
  105. ua = np.array([[[self.ucs_value*self.ulen]*2]*3]*4, dtype='U%s' % self.ulen)
  106. self.content_check(ua, ua[0, 0, 0], 4*self.ulen*2*3*4)
  107. self.content_check(ua, ua[-1, -1, -1], 4*self.ulen*2*3*4)
  108. class TestCreateValues_1_UCS2(CreateValues):
  109. """Check the creation of valued arrays (size 1, UCS2 values)"""
  110. ulen = 1
  111. ucs_value = ucs2_value
  112. class TestCreateValues_1_UCS4(CreateValues):
  113. """Check the creation of valued arrays (size 1, UCS4 values)"""
  114. ulen = 1
  115. ucs_value = ucs4_value
  116. class TestCreateValues_2_UCS2(CreateValues):
  117. """Check the creation of valued arrays (size 2, UCS2 values)"""
  118. ulen = 2
  119. ucs_value = ucs2_value
  120. class TestCreateValues_2_UCS4(CreateValues):
  121. """Check the creation of valued arrays (size 2, UCS4 values)"""
  122. ulen = 2
  123. ucs_value = ucs4_value
  124. class TestCreateValues_1009_UCS2(CreateValues):
  125. """Check the creation of valued arrays (size 1009, UCS2 values)"""
  126. ulen = 1009
  127. ucs_value = ucs2_value
  128. class TestCreateValues_1009_UCS4(CreateValues):
  129. """Check the creation of valued arrays (size 1009, UCS4 values)"""
  130. ulen = 1009
  131. ucs_value = ucs4_value
  132. ############################################################
  133. # Assignment tests
  134. ############################################################
  135. class AssignValues:
  136. """Check the assignment of unicode arrays with values"""
  137. def content_check(self, ua, ua_scalar, nbytes):
  138. # Check the length of the unicode base type
  139. assert_(int(ua.dtype.str[2:]) == self.ulen)
  140. # Check the length of the data buffer
  141. assert_(buffer_length(ua) == nbytes)
  142. # Small check that data in array element is ok
  143. assert_(ua_scalar == self.ucs_value*self.ulen)
  144. # Encode to UTF-8 and double check
  145. assert_(ua_scalar.encode('utf-8') ==
  146. (self.ucs_value*self.ulen).encode('utf-8'))
  147. # Check buffer lengths for scalars
  148. if self.ucs_value == ucs4_value:
  149. # In UCS2, the \U0010FFFF will be represented using a
  150. # surrogate *pair*
  151. assert_(buffer_length(ua_scalar) == 2*2*self.ulen)
  152. else:
  153. # In UCS2, the \uFFFF will be represented using a
  154. # regular 2-byte word
  155. assert_(buffer_length(ua_scalar) == 2*self.ulen)
  156. def test_values0D(self):
  157. # Check assignment of 0-dimensional objects with values
  158. ua = np.zeros((), dtype='U%s' % self.ulen)
  159. ua[()] = self.ucs_value*self.ulen
  160. self.content_check(ua, ua[()], 4*self.ulen)
  161. def test_valuesSD(self):
  162. # Check assignment of single-dimensional objects with values
  163. ua = np.zeros((2,), dtype='U%s' % self.ulen)
  164. ua[0] = self.ucs_value*self.ulen
  165. self.content_check(ua, ua[0], 4*self.ulen*2)
  166. ua[1] = self.ucs_value*self.ulen
  167. self.content_check(ua, ua[1], 4*self.ulen*2)
  168. def test_valuesMD(self):
  169. # Check assignment of multi-dimensional objects with values
  170. ua = np.zeros((2, 3, 4), dtype='U%s' % self.ulen)
  171. ua[0, 0, 0] = self.ucs_value*self.ulen
  172. self.content_check(ua, ua[0, 0, 0], 4*self.ulen*2*3*4)
  173. ua[-1, -1, -1] = self.ucs_value*self.ulen
  174. self.content_check(ua, ua[-1, -1, -1], 4*self.ulen*2*3*4)
  175. class TestAssignValues_1_UCS2(AssignValues):
  176. """Check the assignment of valued arrays (size 1, UCS2 values)"""
  177. ulen = 1
  178. ucs_value = ucs2_value
  179. class TestAssignValues_1_UCS4(AssignValues):
  180. """Check the assignment of valued arrays (size 1, UCS4 values)"""
  181. ulen = 1
  182. ucs_value = ucs4_value
  183. class TestAssignValues_2_UCS2(AssignValues):
  184. """Check the assignment of valued arrays (size 2, UCS2 values)"""
  185. ulen = 2
  186. ucs_value = ucs2_value
  187. class TestAssignValues_2_UCS4(AssignValues):
  188. """Check the assignment of valued arrays (size 2, UCS4 values)"""
  189. ulen = 2
  190. ucs_value = ucs4_value
  191. class TestAssignValues_1009_UCS2(AssignValues):
  192. """Check the assignment of valued arrays (size 1009, UCS2 values)"""
  193. ulen = 1009
  194. ucs_value = ucs2_value
  195. class TestAssignValues_1009_UCS4(AssignValues):
  196. """Check the assignment of valued arrays (size 1009, UCS4 values)"""
  197. ulen = 1009
  198. ucs_value = ucs4_value
  199. ############################################################
  200. # Byteorder tests
  201. ############################################################
  202. class ByteorderValues:
  203. """Check the byteorder of unicode arrays in round-trip conversions"""
  204. def test_values0D(self):
  205. # Check byteorder of 0-dimensional objects
  206. ua = np.array(self.ucs_value*self.ulen, dtype='U%s' % self.ulen)
  207. ua2 = ua.newbyteorder()
  208. # This changes the interpretation of the data region (but not the
  209. # actual data), therefore the returned scalars are not
  210. # the same (they are byte-swapped versions of each other).
  211. assert_(ua[()] != ua2[()])
  212. ua3 = ua2.newbyteorder()
  213. # Arrays must be equal after the round-trip
  214. assert_equal(ua, ua3)
  215. def test_valuesSD(self):
  216. # Check byteorder of single-dimensional objects
  217. ua = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
  218. ua2 = ua.newbyteorder()
  219. assert_((ua != ua2).all())
  220. assert_(ua[-1] != ua2[-1])
  221. ua3 = ua2.newbyteorder()
  222. # Arrays must be equal after the round-trip
  223. assert_equal(ua, ua3)
  224. def test_valuesMD(self):
  225. # Check byteorder of multi-dimensional objects
  226. ua = np.array([[[self.ucs_value*self.ulen]*2]*3]*4,
  227. dtype='U%s' % self.ulen)
  228. ua2 = ua.newbyteorder()
  229. assert_((ua != ua2).all())
  230. assert_(ua[-1, -1, -1] != ua2[-1, -1, -1])
  231. ua3 = ua2.newbyteorder()
  232. # Arrays must be equal after the round-trip
  233. assert_equal(ua, ua3)
  234. def test_values_cast(self):
  235. # Check byteorder of when casting the array for a strided and
  236. # contiguous array:
  237. test1 = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
  238. test2 = np.repeat(test1, 2)[::2]
  239. for ua in (test1, test2):
  240. ua2 = ua.astype(dtype=ua.dtype.newbyteorder())
  241. assert_((ua == ua2).all())
  242. assert_(ua[-1] == ua2[-1])
  243. ua3 = ua2.astype(dtype=ua.dtype)
  244. # Arrays must be equal after the round-trip
  245. assert_equal(ua, ua3)
  246. def test_values_updowncast(self):
  247. # Check byteorder of when casting the array to a longer and shorter
  248. # string length for strided and contiguous arrays
  249. test1 = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
  250. test2 = np.repeat(test1, 2)[::2]
  251. for ua in (test1, test2):
  252. # Cast to a longer type with zero padding
  253. longer_type = np.dtype('U%s' % (self.ulen+1)).newbyteorder()
  254. ua2 = ua.astype(dtype=longer_type)
  255. assert_((ua == ua2).all())
  256. assert_(ua[-1] == ua2[-1])
  257. # Cast back again with truncating:
  258. ua3 = ua2.astype(dtype=ua.dtype)
  259. # Arrays must be equal after the round-trip
  260. assert_equal(ua, ua3)
  261. class TestByteorder_1_UCS2(ByteorderValues):
  262. """Check the byteorder in unicode (size 1, UCS2 values)"""
  263. ulen = 1
  264. ucs_value = ucs2_value
  265. class TestByteorder_1_UCS4(ByteorderValues):
  266. """Check the byteorder in unicode (size 1, UCS4 values)"""
  267. ulen = 1
  268. ucs_value = ucs4_value
  269. class TestByteorder_2_UCS2(ByteorderValues):
  270. """Check the byteorder in unicode (size 2, UCS2 values)"""
  271. ulen = 2
  272. ucs_value = ucs2_value
  273. class TestByteorder_2_UCS4(ByteorderValues):
  274. """Check the byteorder in unicode (size 2, UCS4 values)"""
  275. ulen = 2
  276. ucs_value = ucs4_value
  277. class TestByteorder_1009_UCS2(ByteorderValues):
  278. """Check the byteorder in unicode (size 1009, UCS2 values)"""
  279. ulen = 1009
  280. ucs_value = ucs2_value
  281. class TestByteorder_1009_UCS4(ByteorderValues):
  282. """Check the byteorder in unicode (size 1009, UCS4 values)"""
  283. ulen = 1009
  284. ucs_value = ucs4_value