mul.c 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902
  1. /* Copyright 2008, Google Inc.
  2. * All rights reserved.
  3. *
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions are
  6. * met:
  7. *
  8. * * Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * * Redistributions in binary form must reproduce the above
  11. * copyright notice, this list of conditions and the following disclaimer
  12. * in the documentation and/or other materials provided with the
  13. * distribution.
  14. * * Neither the name of Google Inc. nor the names of its
  15. * contributors may be used to endorse or promote products derived from
  16. * this software without specific prior written permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. *
  30. * curve25519-donna: Curve25519 elliptic curve, public key function
  31. *
  32. * http://code.google.com/p/curve25519-donna/
  33. *
  34. * Adam Langley <agl@imperialviolet.org>
  35. *
  36. * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to>
  37. *
  38. * More information about curve25519 can be found here
  39. * http://cr.yp.to/ecdh.html
  40. *
  41. * djb's sample implementation of curve25519 is written in a special assembly
  42. * language called qhasm and uses the floating point registers.
  43. *
  44. * This is, almost, a clean room reimplementation from the curve25519 paper. It
  45. * uses many of the tricks described therein. Only the crecip function is taken
  46. * from the sample implementation.
  47. */
  48. #include <string.h>
  49. #include <stdint.h>
  50. #include <stdlib.h>
  51. #include <stdio.h>
  52. #include <errno.h>
  53. #ifdef _MSC_VER
  54. #define inline __inline
  55. #endif
  56. typedef uint8_t u8;
  57. typedef int32_t s32;
  58. typedef int64_t limb;
  59. /* Field element representation:
  60. *
  61. * Field elements are written as an array of signed, 64-bit limbs, least
  62. * significant first. The value of the field element is:
  63. * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
  64. *
  65. * i.e. the limbs are 26, 25, 26, 25, ... bits wide.
  66. */
  67. /* Sum two numbers: output += in */
  68. static void fsum(limb *output, const limb *in)
  69. {
  70. unsigned i;
  71. for (i = 0; i < 10; i += 2)
  72. {
  73. output[0 + i] = (output[0 + i] + in[0 + i]);
  74. output[1 + i] = (output[1 + i] + in[1 + i]);
  75. }
  76. }
  77. /* Find the difference of two numbers: output = in - output
  78. * (note the order of the arguments!)
  79. */
  80. static void fdifference(limb *output, const limb *in)
  81. {
  82. unsigned i;
  83. for (i = 0; i < 10; ++i)
  84. {
  85. output[i] = (in[i] - output[i]);
  86. }
  87. }
  88. /* Multiply a number by a scalar: output = in * scalar */
  89. static void fscalar_product(limb *output, const limb *in, const limb scalar)
  90. {
  91. unsigned i;
  92. for (i = 0; i < 10; ++i)
  93. {
  94. output[i] = in[i] * scalar;
  95. }
  96. }
  97. /* Multiply two numbers: output = in2 * in
  98. *
  99. * output must be distinct to both inputs. The inputs are reduced coefficient
  100. * form, the output is not.
  101. */
  102. static void fproduct(limb *output, const limb *in2, const limb *in)
  103. {
  104. output[0] = ((limb)((s32)in2[0])) * ((s32)in[0]);
  105. output[1] = ((limb)((s32)in2[0])) * ((s32)in[1]) +
  106. ((limb)((s32)in2[1])) * ((s32)in[0]);
  107. output[2] = 2 * ((limb)((s32)in2[1])) * ((s32)in[1]) +
  108. ((limb)((s32)in2[0])) * ((s32)in[2]) +
  109. ((limb)((s32)in2[2])) * ((s32)in[0]);
  110. output[3] = ((limb)((s32)in2[1])) * ((s32)in[2]) +
  111. ((limb)((s32)in2[2])) * ((s32)in[1]) +
  112. ((limb)((s32)in2[0])) * ((s32)in[3]) +
  113. ((limb)((s32)in2[3])) * ((s32)in[0]);
  114. output[4] = ((limb)((s32)in2[2])) * ((s32)in[2]) +
  115. 2 * (((limb)((s32)in2[1])) * ((s32)in[3]) +
  116. ((limb)((s32)in2[3])) * ((s32)in[1])) +
  117. ((limb)((s32)in2[0])) * ((s32)in[4]) +
  118. ((limb)((s32)in2[4])) * ((s32)in[0]);
  119. output[5] = ((limb)((s32)in2[2])) * ((s32)in[3]) +
  120. ((limb)((s32)in2[3])) * ((s32)in[2]) +
  121. ((limb)((s32)in2[1])) * ((s32)in[4]) +
  122. ((limb)((s32)in2[4])) * ((s32)in[1]) +
  123. ((limb)((s32)in2[0])) * ((s32)in[5]) +
  124. ((limb)((s32)in2[5])) * ((s32)in[0]);
  125. output[6] = 2 * (((limb)((s32)in2[3])) * ((s32)in[3]) +
  126. ((limb)((s32)in2[1])) * ((s32)in[5]) +
  127. ((limb)((s32)in2[5])) * ((s32)in[1])) +
  128. ((limb)((s32)in2[2])) * ((s32)in[4]) +
  129. ((limb)((s32)in2[4])) * ((s32)in[2]) +
  130. ((limb)((s32)in2[0])) * ((s32)in[6]) +
  131. ((limb)((s32)in2[6])) * ((s32)in[0]);
  132. output[7] = ((limb)((s32)in2[3])) * ((s32)in[4]) +
  133. ((limb)((s32)in2[4])) * ((s32)in[3]) +
  134. ((limb)((s32)in2[2])) * ((s32)in[5]) +
  135. ((limb)((s32)in2[5])) * ((s32)in[2]) +
  136. ((limb)((s32)in2[1])) * ((s32)in[6]) +
  137. ((limb)((s32)in2[6])) * ((s32)in[1]) +
  138. ((limb)((s32)in2[0])) * ((s32)in[7]) +
  139. ((limb)((s32)in2[7])) * ((s32)in[0]);
  140. output[8] = ((limb)((s32)in2[4])) * ((s32)in[4]) +
  141. 2 * (((limb)((s32)in2[3])) * ((s32)in[5]) +
  142. ((limb)((s32)in2[5])) * ((s32)in[3]) +
  143. ((limb)((s32)in2[1])) * ((s32)in[7]) +
  144. ((limb)((s32)in2[7])) * ((s32)in[1])) +
  145. ((limb)((s32)in2[2])) * ((s32)in[6]) +
  146. ((limb)((s32)in2[6])) * ((s32)in[2]) +
  147. ((limb)((s32)in2[0])) * ((s32)in[8]) +
  148. ((limb)((s32)in2[8])) * ((s32)in[0]);
  149. output[9] = ((limb)((s32)in2[4])) * ((s32)in[5]) +
  150. ((limb)((s32)in2[5])) * ((s32)in[4]) +
  151. ((limb)((s32)in2[3])) * ((s32)in[6]) +
  152. ((limb)((s32)in2[6])) * ((s32)in[3]) +
  153. ((limb)((s32)in2[2])) * ((s32)in[7]) +
  154. ((limb)((s32)in2[7])) * ((s32)in[2]) +
  155. ((limb)((s32)in2[1])) * ((s32)in[8]) +
  156. ((limb)((s32)in2[8])) * ((s32)in[1]) +
  157. ((limb)((s32)in2[0])) * ((s32)in[9]) +
  158. ((limb)((s32)in2[9])) * ((s32)in[0]);
  159. output[10] = 2 * (((limb)((s32)in2[5])) * ((s32)in[5]) +
  160. ((limb)((s32)in2[3])) * ((s32)in[7]) +
  161. ((limb)((s32)in2[7])) * ((s32)in[3]) +
  162. ((limb)((s32)in2[1])) * ((s32)in[9]) +
  163. ((limb)((s32)in2[9])) * ((s32)in[1])) +
  164. ((limb)((s32)in2[4])) * ((s32)in[6]) +
  165. ((limb)((s32)in2[6])) * ((s32)in[4]) +
  166. ((limb)((s32)in2[2])) * ((s32)in[8]) +
  167. ((limb)((s32)in2[8])) * ((s32)in[2]);
  168. output[11] = ((limb)((s32)in2[5])) * ((s32)in[6]) +
  169. ((limb)((s32)in2[6])) * ((s32)in[5]) +
  170. ((limb)((s32)in2[4])) * ((s32)in[7]) +
  171. ((limb)((s32)in2[7])) * ((s32)in[4]) +
  172. ((limb)((s32)in2[3])) * ((s32)in[8]) +
  173. ((limb)((s32)in2[8])) * ((s32)in[3]) +
  174. ((limb)((s32)in2[2])) * ((s32)in[9]) +
  175. ((limb)((s32)in2[9])) * ((s32)in[2]);
  176. output[12] = ((limb)((s32)in2[6])) * ((s32)in[6]) +
  177. 2 * (((limb)((s32)in2[5])) * ((s32)in[7]) +
  178. ((limb)((s32)in2[7])) * ((s32)in[5]) +
  179. ((limb)((s32)in2[3])) * ((s32)in[9]) +
  180. ((limb)((s32)in2[9])) * ((s32)in[3])) +
  181. ((limb)((s32)in2[4])) * ((s32)in[8]) +
  182. ((limb)((s32)in2[8])) * ((s32)in[4]);
  183. output[13] = ((limb)((s32)in2[6])) * ((s32)in[7]) +
  184. ((limb)((s32)in2[7])) * ((s32)in[6]) +
  185. ((limb)((s32)in2[5])) * ((s32)in[8]) +
  186. ((limb)((s32)in2[8])) * ((s32)in[5]) +
  187. ((limb)((s32)in2[4])) * ((s32)in[9]) +
  188. ((limb)((s32)in2[9])) * ((s32)in[4]);
  189. output[14] = 2 * (((limb)((s32)in2[7])) * ((s32)in[7]) +
  190. ((limb)((s32)in2[5])) * ((s32)in[9]) +
  191. ((limb)((s32)in2[9])) * ((s32)in[5])) +
  192. ((limb)((s32)in2[6])) * ((s32)in[8]) +
  193. ((limb)((s32)in2[8])) * ((s32)in[6]);
  194. output[15] = ((limb)((s32)in2[7])) * ((s32)in[8]) +
  195. ((limb)((s32)in2[8])) * ((s32)in[7]) +
  196. ((limb)((s32)in2[6])) * ((s32)in[9]) +
  197. ((limb)((s32)in2[9])) * ((s32)in[6]);
  198. output[16] = ((limb)((s32)in2[8])) * ((s32)in[8]) +
  199. 2 * (((limb)((s32)in2[7])) * ((s32)in[9]) +
  200. ((limb)((s32)in2[9])) * ((s32)in[7]));
  201. output[17] = ((limb)((s32)in2[8])) * ((s32)in[9]) +
  202. ((limb)((s32)in2[9])) * ((s32)in[8]);
  203. output[18] = 2 * ((limb)((s32)in2[9])) * ((s32)in[9]);
  204. }
  205. /* Reduce a long form to a short form by taking the input mod 2^255 - 19. */
  206. static void freduce_degree(limb *output)
  207. {
  208. /* Each of these shifts and adds ends up multiplying the value by 19. */
  209. output[8] += output[18] << 4;
  210. output[8] += output[18] << 1;
  211. output[8] += output[18];
  212. output[7] += output[17] << 4;
  213. output[7] += output[17] << 1;
  214. output[7] += output[17];
  215. output[6] += output[16] << 4;
  216. output[6] += output[16] << 1;
  217. output[6] += output[16];
  218. output[5] += output[15] << 4;
  219. output[5] += output[15] << 1;
  220. output[5] += output[15];
  221. output[4] += output[14] << 4;
  222. output[4] += output[14] << 1;
  223. output[4] += output[14];
  224. output[3] += output[13] << 4;
  225. output[3] += output[13] << 1;
  226. output[3] += output[13];
  227. output[2] += output[12] << 4;
  228. output[2] += output[12] << 1;
  229. output[2] += output[12];
  230. output[1] += output[11] << 4;
  231. output[1] += output[11] << 1;
  232. output[1] += output[11];
  233. output[0] += output[10] << 4;
  234. output[0] += output[10] << 1;
  235. output[0] += output[10];
  236. }
  237. #if (-1 & 3) != 3
  238. #error "This code only works on a two's complement system"
  239. #endif
  240. /* return v / 2^26, using only shifts and adds. */
  241. static inline limb
  242. div_by_2_26(const limb v)
  243. {
  244. /* High word of v; no shift needed*/
  245. const uint32_t highword = (uint32_t)(((uint64_t)v) >> 32);
  246. /* Set to all 1s if v was negative; else set to 0s. */
  247. const int32_t sign = ((int32_t)highword) >> 31;
  248. /* Set to 0x3ffffff if v was negative; else set to 0. */
  249. const int32_t roundoff = ((uint32_t)sign) >> 6;
  250. /* Should return v / (1<<26) */
  251. return (v + roundoff) >> 26;
  252. }
  253. /* return v / (2^25), using only shifts and adds. */
  254. static inline limb
  255. div_by_2_25(const limb v)
  256. {
  257. /* High word of v; no shift needed*/
  258. const uint32_t highword = (uint32_t)(((uint64_t)v) >> 32);
  259. /* Set to all 1s if v was negative; else set to 0s. */
  260. const int32_t sign = ((int32_t)highword) >> 31;
  261. /* Set to 0x1ffffff if v was negative; else set to 0. */
  262. const int32_t roundoff = ((uint32_t)sign) >> 7;
  263. /* Should return v / (1<<25) */
  264. return (v + roundoff) >> 25;
  265. }
  266. static inline s32
  267. div_s32_by_2_25(const s32 v)
  268. {
  269. const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
  270. return (v + roundoff) >> 25;
  271. }
  272. /* Reduce all coefficients of the short form input so that |x| < 2^26.
  273. *
  274. * On entry: |output[i]| < 2^62
  275. */
  276. static void freduce_coefficients(limb *output)
  277. {
  278. unsigned i;
  279. output[10] = 0;
  280. for (i = 0; i < 10; i += 2)
  281. {
  282. limb over = div_by_2_26(output[i]);
  283. output[i] -= over << 26;
  284. output[i + 1] += over;
  285. over = div_by_2_25(output[i + 1]);
  286. output[i + 1] -= over << 25;
  287. output[i + 2] += over;
  288. }
  289. /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */
  290. output[0] += output[10] << 4;
  291. output[0] += output[10] << 1;
  292. output[0] += output[10];
  293. output[10] = 0;
  294. /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38
  295. * So |over| will be no more than 77825 */
  296. {
  297. limb over = div_by_2_26(output[0]);
  298. output[0] -= over << 26;
  299. output[1] += over;
  300. }
  301. /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825
  302. * So |over| will be no more than 1. */
  303. {
  304. /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */
  305. s32 over32 = div_s32_by_2_25((s32)output[1]);
  306. output[1] -= over32 << 25;
  307. output[2] += over32;
  308. }
  309. /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced":
  310. * we have |output[2]| <= 2^26. This is good enough for all of our math,
  311. * but it will require an extra freduce_coefficients before fcontract. */
  312. }
  313. /* A helpful wrapper around fproduct: output = in * in2.
  314. *
  315. * output must be distinct to both inputs. The output is reduced degree and
  316. * reduced coefficient.
  317. */
  318. static void
  319. fmul(limb *output, const limb *in, const limb *in2)
  320. {
  321. limb t[19];
  322. fproduct(t, in, in2);
  323. freduce_degree(t);
  324. freduce_coefficients(t);
  325. memcpy(output, t, sizeof(limb) * 10);
  326. }
  327. static void fsquare_inner(limb *output, const limb *in)
  328. {
  329. output[0] = ((limb)((s32)in[0])) * ((s32)in[0]);
  330. output[1] = 2 * ((limb)((s32)in[0])) * ((s32)in[1]);
  331. output[2] = 2 * (((limb)((s32)in[1])) * ((s32)in[1]) +
  332. ((limb)((s32)in[0])) * ((s32)in[2]));
  333. output[3] = 2 * (((limb)((s32)in[1])) * ((s32)in[2]) +
  334. ((limb)((s32)in[0])) * ((s32)in[3]));
  335. output[4] = ((limb)((s32)in[2])) * ((s32)in[2]) +
  336. 4 * ((limb)((s32)in[1])) * ((s32)in[3]) +
  337. 2 * ((limb)((s32)in[0])) * ((s32)in[4]);
  338. output[5] = 2 * (((limb)((s32)in[2])) * ((s32)in[3]) +
  339. ((limb)((s32)in[1])) * ((s32)in[4]) +
  340. ((limb)((s32)in[0])) * ((s32)in[5]));
  341. output[6] = 2 * (((limb)((s32)in[3])) * ((s32)in[3]) +
  342. ((limb)((s32)in[2])) * ((s32)in[4]) +
  343. ((limb)((s32)in[0])) * ((s32)in[6]) +
  344. 2 * ((limb)((s32)in[1])) * ((s32)in[5]));
  345. output[7] = 2 * (((limb)((s32)in[3])) * ((s32)in[4]) +
  346. ((limb)((s32)in[2])) * ((s32)in[5]) +
  347. ((limb)((s32)in[1])) * ((s32)in[6]) +
  348. ((limb)((s32)in[0])) * ((s32)in[7]));
  349. output[8] = ((limb)((s32)in[4])) * ((s32)in[4]) +
  350. 2 * (((limb)((s32)in[2])) * ((s32)in[6]) +
  351. ((limb)((s32)in[0])) * ((s32)in[8]) +
  352. 2 * (((limb)((s32)in[1])) * ((s32)in[7]) +
  353. ((limb)((s32)in[3])) * ((s32)in[5])));
  354. output[9] = 2 * (((limb)((s32)in[4])) * ((s32)in[5]) +
  355. ((limb)((s32)in[3])) * ((s32)in[6]) +
  356. ((limb)((s32)in[2])) * ((s32)in[7]) +
  357. ((limb)((s32)in[1])) * ((s32)in[8]) +
  358. ((limb)((s32)in[0])) * ((s32)in[9]));
  359. output[10] = 2 * (((limb)((s32)in[5])) * ((s32)in[5]) +
  360. ((limb)((s32)in[4])) * ((s32)in[6]) +
  361. ((limb)((s32)in[2])) * ((s32)in[8]) +
  362. 2 * (((limb)((s32)in[3])) * ((s32)in[7]) +
  363. ((limb)((s32)in[1])) * ((s32)in[9])));
  364. output[11] = 2 * (((limb)((s32)in[5])) * ((s32)in[6]) +
  365. ((limb)((s32)in[4])) * ((s32)in[7]) +
  366. ((limb)((s32)in[3])) * ((s32)in[8]) +
  367. ((limb)((s32)in[2])) * ((s32)in[9]));
  368. output[12] = ((limb)((s32)in[6])) * ((s32)in[6]) +
  369. 2 * (((limb)((s32)in[4])) * ((s32)in[8]) +
  370. 2 * (((limb)((s32)in[5])) * ((s32)in[7]) +
  371. ((limb)((s32)in[3])) * ((s32)in[9])));
  372. output[13] = 2 * (((limb)((s32)in[6])) * ((s32)in[7]) +
  373. ((limb)((s32)in[5])) * ((s32)in[8]) +
  374. ((limb)((s32)in[4])) * ((s32)in[9]));
  375. output[14] = 2 * (((limb)((s32)in[7])) * ((s32)in[7]) +
  376. ((limb)((s32)in[6])) * ((s32)in[8]) +
  377. 2 * ((limb)((s32)in[5])) * ((s32)in[9]));
  378. output[15] = 2 * (((limb)((s32)in[7])) * ((s32)in[8]) +
  379. ((limb)((s32)in[6])) * ((s32)in[9]));
  380. output[16] = ((limb)((s32)in[8])) * ((s32)in[8]) +
  381. 4 * ((limb)((s32)in[7])) * ((s32)in[9]);
  382. output[17] = 2 * ((limb)((s32)in[8])) * ((s32)in[9]);
  383. output[18] = 2 * ((limb)((s32)in[9])) * ((s32)in[9]);
  384. }
  385. static void
  386. fsquare(limb *output, const limb *in)
  387. {
  388. limb t[19];
  389. fsquare_inner(t, in);
  390. freduce_degree(t);
  391. freduce_coefficients(t);
  392. memcpy(output, t, sizeof(limb) * 10);
  393. }
  394. /* Take a little-endian, 32-byte number and expand it into polynomial form */
  395. static void
  396. fexpand(limb *output, const u8 *input)
  397. {
  398. #define F(n, start, shift, mask) \
  399. output[n] = ((((limb)input[start + 0]) | \
  400. ((limb)input[start + 1]) << 8 | \
  401. ((limb)input[start + 2]) << 16 | \
  402. ((limb)input[start + 3]) << 24) >> \
  403. shift) & \
  404. mask;
  405. F(0, 0, 0, 0x3ffffff);
  406. F(1, 3, 2, 0x1ffffff);
  407. F(2, 6, 3, 0x3ffffff);
  408. F(3, 9, 5, 0x1ffffff);
  409. F(4, 12, 6, 0x3ffffff);
  410. F(5, 16, 0, 0x1ffffff);
  411. F(6, 19, 1, 0x3ffffff);
  412. F(7, 22, 3, 0x1ffffff);
  413. F(8, 25, 4, 0x3ffffff);
  414. F(9, 28, 6, 0x3ffffff);
  415. #undef F
  416. }
  417. #if (-32 >> 1) != -16
  418. #error "This code only works when >> does sign-extension on negative numbers"
  419. #endif
  420. /* Take a fully reduced polynomial form number and contract it into a
  421. * little-endian, 32-byte array
  422. */
  423. static void
  424. fcontract(u8 *output, limb *input)
  425. {
  426. int i;
  427. int j;
  428. for (j = 0; j < 2; ++j)
  429. {
  430. for (i = 0; i < 9; ++i)
  431. {
  432. if ((i & 1) == 1)
  433. {
  434. /* This calculation is a time-invariant way to make input[i] positive
  435. by borrowing from the next-larger limb.
  436. */
  437. const s32 mask = (s32)(input[i]) >> 31;
  438. const s32 carry = -(((s32)(input[i]) & mask) >> 25);
  439. input[i] = (s32)(input[i]) + (carry << 25);
  440. input[i + 1] = (s32)(input[i + 1]) - carry;
  441. }
  442. else
  443. {
  444. const s32 mask = (s32)(input[i]) >> 31;
  445. const s32 carry = -(((s32)(input[i]) & mask) >> 26);
  446. input[i] = (s32)(input[i]) + (carry << 26);
  447. input[i + 1] = (s32)(input[i + 1]) - carry;
  448. }
  449. }
  450. {
  451. const s32 mask = (s32)(input[9]) >> 31;
  452. const s32 carry = -(((s32)(input[9]) & mask) >> 25);
  453. input[9] = (s32)(input[9]) + (carry << 25);
  454. input[0] = (s32)(input[0]) - (carry * 19);
  455. }
  456. }
  457. /* The first borrow-propagation pass above ended with every limb
  458. except (possibly) input[0] non-negative.
  459. Since each input limb except input[0] is decreased by at most 1
  460. by a borrow-propagation pass, the second borrow-propagation pass
  461. could only have wrapped around to decrease input[0] again if the
  462. first pass left input[0] negative *and* input[1] through input[9]
  463. were all zero. In that case, input[1] is now 2^25 - 1, and this
  464. last borrow-propagation step will leave input[1] non-negative.
  465. */
  466. {
  467. const s32 mask = (s32)(input[0]) >> 31;
  468. const s32 carry = -(((s32)(input[0]) & mask) >> 26);
  469. input[0] = (s32)(input[0]) + (carry << 26);
  470. input[1] = (s32)(input[1]) - carry;
  471. }
  472. /* Both passes through the above loop, plus the last 0-to-1 step, are
  473. necessary: if input[9] is -1 and input[0] through input[8] are 0,
  474. negative values will remain in the array until the end.
  475. */
  476. input[1] <<= 2;
  477. input[2] <<= 3;
  478. input[3] <<= 5;
  479. input[4] <<= 6;
  480. input[6] <<= 1;
  481. input[7] <<= 3;
  482. input[8] <<= 4;
  483. input[9] <<= 6;
  484. #define F(i, s) \
  485. output[s + 0] |= input[i] & 0xff; \
  486. output[s + 1] = (input[i] >> 8) & 0xff; \
  487. output[s + 2] = (input[i] >> 16) & 0xff; \
  488. output[s + 3] = (input[i] >> 24) & 0xff;
  489. output[0] = 0;
  490. output[16] = 0;
  491. F(0, 0);
  492. F(1, 3);
  493. F(2, 6);
  494. F(3, 9);
  495. F(4, 12);
  496. F(5, 16);
  497. F(6, 19);
  498. F(7, 22);
  499. F(8, 25);
  500. F(9, 28);
  501. #undef F
  502. }
  503. /* Input: Q, Q', Q-Q'
  504. * Output: 2Q, Q+Q'
  505. *
  506. * x2 z3: long form
  507. * x3 z3: long form
  508. * x z: short form, destroyed
  509. * xprime zprime: short form, destroyed
  510. * qmqp: short form, preserved
  511. */
  512. static void fmonty(limb *x2, limb *z2, /* output 2Q */
  513. limb *x3, limb *z3, /* output Q + Q' */
  514. limb *x, limb *z, /* input Q */
  515. limb *xprime, limb *zprime, /* input Q' */
  516. const limb *qmqp /* input Q - Q' */)
  517. {
  518. limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
  519. zzprime[19], zzzprime[19], xxxprime[19];
  520. memcpy(origx, x, 10 * sizeof(limb));
  521. fsum(x, z);
  522. fdifference(z, origx); // does x - z
  523. memcpy(origxprime, xprime, sizeof(limb) * 10);
  524. fsum(xprime, zprime);
  525. fdifference(zprime, origxprime);
  526. fproduct(xxprime, xprime, z);
  527. fproduct(zzprime, x, zprime);
  528. freduce_degree(xxprime);
  529. freduce_coefficients(xxprime);
  530. freduce_degree(zzprime);
  531. freduce_coefficients(zzprime);
  532. memcpy(origxprime, xxprime, sizeof(limb) * 10);
  533. fsum(xxprime, zzprime);
  534. fdifference(zzprime, origxprime);
  535. fsquare(xxxprime, xxprime);
  536. fsquare(zzzprime, zzprime);
  537. fproduct(zzprime, zzzprime, qmqp);
  538. freduce_degree(zzprime);
  539. freduce_coefficients(zzprime);
  540. memcpy(x3, xxxprime, sizeof(limb) * 10);
  541. memcpy(z3, zzprime, sizeof(limb) * 10);
  542. fsquare(xx, x);
  543. fsquare(zz, z);
  544. fproduct(x2, xx, zz);
  545. freduce_degree(x2);
  546. freduce_coefficients(x2);
  547. fdifference(zz, xx); // does zz = xx - zz
  548. memset(zzz + 10, 0, sizeof(limb) * 9);
  549. fscalar_product(zzz, zz, 121665);
  550. /* No need to call freduce_degree here:
  551. fscalar_product doesn't increase the degree of its input. */
  552. freduce_coefficients(zzz);
  553. fsum(zzz, xx);
  554. fproduct(z2, zz, zzz);
  555. freduce_degree(z2);
  556. freduce_coefficients(z2);
  557. }
  558. /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
  559. * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
  560. * side-channel attacks.
  561. *
  562. * NOTE that this function requires that 'iswap' be 1 or 0; other values give
  563. * wrong results. Also, the two limb arrays must be in reduced-coefficient,
  564. * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
  565. * and all all values in a[0..9],b[0..9] must have magnitude less than
  566. * INT32_MAX.
  567. */
  568. static void
  569. swap_conditional(limb a[19], limb b[19], limb iswap)
  570. {
  571. unsigned i;
  572. const s32 swap = (s32)-iswap;
  573. for (i = 0; i < 10; ++i)
  574. {
  575. const s32 x = swap & (((s32)a[i]) ^ ((s32)b[i]));
  576. a[i] = ((s32)a[i]) ^ x;
  577. b[i] = ((s32)b[i]) ^ x;
  578. }
  579. }
  580. /* Calculates nQ where Q is the x-coordinate of a point on the curve
  581. *
  582. * resultx/resultz: the x coordinate of the resulting curve point (short form)
  583. * n: a little endian, 32-byte number
  584. * q: a point of the curve (short form)
  585. */
  586. static void
  587. cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q)
  588. {
  589. limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
  590. limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
  591. limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
  592. limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
  593. unsigned i, j;
  594. memcpy(nqpqx, q, sizeof(limb) * 10);
  595. for (i = 0; i < 32; ++i)
  596. {
  597. u8 byte = n[31 - i];
  598. for (j = 0; j < 8; ++j)
  599. {
  600. const limb bit = byte >> 7;
  601. swap_conditional(nqx, nqpqx, bit);
  602. swap_conditional(nqz, nqpqz, bit);
  603. fmonty(nqx2, nqz2,
  604. nqpqx2, nqpqz2,
  605. nqx, nqz,
  606. nqpqx, nqpqz,
  607. q);
  608. swap_conditional(nqx2, nqpqx2, bit);
  609. swap_conditional(nqz2, nqpqz2, bit);
  610. t = nqx;
  611. nqx = nqx2;
  612. nqx2 = t;
  613. t = nqz;
  614. nqz = nqz2;
  615. nqz2 = t;
  616. t = nqpqx;
  617. nqpqx = nqpqx2;
  618. nqpqx2 = t;
  619. t = nqpqz;
  620. nqpqz = nqpqz2;
  621. nqpqz2 = t;
  622. byte <<= 1;
  623. }
  624. }
  625. memcpy(resultx, nqx, sizeof(limb) * 10);
  626. memcpy(resultz, nqz, sizeof(limb) * 10);
  627. }
  628. // -----------------------------------------------------------------------------
  629. // Shamelessly copied from djb's code
  630. // -----------------------------------------------------------------------------
  631. static void
  632. crecip(limb *out, const limb *z)
  633. {
  634. limb z2[10];
  635. limb z9[10];
  636. limb z11[10];
  637. limb z2_5_0[10];
  638. limb z2_10_0[10];
  639. limb z2_20_0[10];
  640. limb z2_50_0[10];
  641. limb z2_100_0[10];
  642. limb t0[10];
  643. limb t1[10];
  644. int i;
  645. /* 2 */ fsquare(z2, z);
  646. /* 4 */ fsquare(t1, z2);
  647. /* 8 */ fsquare(t0, t1);
  648. /* 9 */ fmul(z9, t0, z);
  649. /* 11 */ fmul(z11, z9, z2);
  650. /* 22 */ fsquare(t0, z11);
  651. /* 2^5 - 2^0 = 31 */ fmul(z2_5_0, t0, z9);
  652. /* 2^6 - 2^1 */ fsquare(t0, z2_5_0);
  653. /* 2^7 - 2^2 */ fsquare(t1, t0);
  654. /* 2^8 - 2^3 */ fsquare(t0, t1);
  655. /* 2^9 - 2^4 */ fsquare(t1, t0);
  656. /* 2^10 - 2^5 */ fsquare(t0, t1);
  657. /* 2^10 - 2^0 */ fmul(z2_10_0, t0, z2_5_0);
  658. /* 2^11 - 2^1 */ fsquare(t0, z2_10_0);
  659. /* 2^12 - 2^2 */ fsquare(t1, t0);
  660. /* 2^20 - 2^10 */ for (i = 2; i < 10; i += 2)
  661. {
  662. fsquare(t0, t1);
  663. fsquare(t1, t0);
  664. }
  665. /* 2^20 - 2^0 */ fmul(z2_20_0, t1, z2_10_0);
  666. /* 2^21 - 2^1 */ fsquare(t0, z2_20_0);
  667. /* 2^22 - 2^2 */ fsquare(t1, t0);
  668. /* 2^40 - 2^20 */ for (i = 2; i < 20; i += 2)
  669. {
  670. fsquare(t0, t1);
  671. fsquare(t1, t0);
  672. }
  673. /* 2^40 - 2^0 */ fmul(t0, t1, z2_20_0);
  674. /* 2^41 - 2^1 */ fsquare(t1, t0);
  675. /* 2^42 - 2^2 */ fsquare(t0, t1);
  676. /* 2^50 - 2^10 */ for (i = 2; i < 10; i += 2)
  677. {
  678. fsquare(t1, t0);
  679. fsquare(t0, t1);
  680. }
  681. /* 2^50 - 2^0 */ fmul(z2_50_0, t0, z2_10_0);
  682. /* 2^51 - 2^1 */ fsquare(t0, z2_50_0);
  683. /* 2^52 - 2^2 */ fsquare(t1, t0);
  684. /* 2^100 - 2^50 */ for (i = 2; i < 50; i += 2)
  685. {
  686. fsquare(t0, t1);
  687. fsquare(t1, t0);
  688. }
  689. /* 2^100 - 2^0 */ fmul(z2_100_0, t1, z2_50_0);
  690. /* 2^101 - 2^1 */ fsquare(t1, z2_100_0);
  691. /* 2^102 - 2^2 */ fsquare(t0, t1);
  692. /* 2^200 - 2^100 */ for (i = 2; i < 100; i += 2)
  693. {
  694. fsquare(t1, t0);
  695. fsquare(t0, t1);
  696. }
  697. /* 2^200 - 2^0 */ fmul(t1, t0, z2_100_0);
  698. /* 2^201 - 2^1 */ fsquare(t0, t1);
  699. /* 2^202 - 2^2 */ fsquare(t1, t0);
  700. /* 2^250 - 2^50 */ for (i = 2; i < 50; i += 2)
  701. {
  702. fsquare(t0, t1);
  703. fsquare(t1, t0);
  704. }
  705. /* 2^250 - 2^0 */ fmul(t0, t1, z2_50_0);
  706. /* 2^251 - 2^1 */ fsquare(t1, t0);
  707. /* 2^252 - 2^2 */ fsquare(t0, t1);
  708. /* 2^253 - 2^3 */ fsquare(t1, t0);
  709. /* 2^254 - 2^4 */ fsquare(t0, t1);
  710. /* 2^255 - 2^5 */ fsquare(t1, t0);
  711. /* 2^255 - 21 */ fmul(out, t1, z11);
  712. }
  713. int curve25519_donna(u8 *, const u8 *, const u8 *);
  714. int curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint)
  715. {
  716. limb bp[10], x[10], z[11], zmone[10];
  717. uint8_t e[32];
  718. int i;
  719. for (i = 0; i < 32; ++i)
  720. e[i] = secret[i];
  721. e[0] &= 248;
  722. e[31] &= 127;
  723. e[31] |= 64;
  724. fexpand(bp, basepoint);
  725. cmult(x, z, e, bp);
  726. crecip(zmone, z);
  727. fmul(z, x, zmone);
  728. freduce_coefficients(z);
  729. fcontract(mypublic, z);
  730. return 0;
  731. }
  732. /// returns 0 for '=' or unrecognized characters, not a problem since PEM is well constrained.
  733. static int base64_value(int c)
  734. {
  735. if (c >= 'A' && c <= 'Z')
  736. return c - 'A';
  737. if (c >= 'a' && c <= 'z')
  738. return 26 + c - 'a';
  739. if (c >= '0' && c <= '9')
  740. return 52 + c - '0';
  741. if (c == '+')
  742. return 62;
  743. if (c == '/')
  744. return 63;
  745. return 0x1000;
  746. }
  747. /**
  748. * @param[in] data the base64 encoded string
  749. * @param[out] data the decoded result
  750. * @param[in] len the length of base64 encoded data
  751. * @param[out] len the length of decoded result
  752. */
  753. static void base64_decode(u8 *data, int *len)
  754. {
  755. int read = 0;
  756. int write = 0;
  757. int state[4];
  758. while (read < *len)
  759. {
  760. state[read % 4] = base64_value(data[read]);
  761. if (state[read % 4] == 0x1000)
  762. {
  763. break;
  764. }
  765. if ((read % 4) == 3)
  766. {
  767. data[write++] = state[0] << 2 | state[1] >> 4;
  768. data[write++] = state[1] << 4 | state[2] >> 2;
  769. data[write++] = state[2] << 6 | state[3] >> 0;
  770. }
  771. read++;
  772. }
  773. switch (read % 4)
  774. {
  775. case 2:
  776. data[write++] = state[0] << 2 | state[1] >> 4;
  777. break;
  778. case 3:
  779. data[write++] = state[0] << 2 | state[1] >> 4;
  780. data[write++] = state[1] << 4 | state[2] >> 2;
  781. }
  782. *len = write;
  783. }
  784. /**
  785. * reads the 32-byte key from a PEM file, takes advantage of the
  786. * fact that the last 32 bytes of encoded DER data are the key in
  787. * both the private and public key forms.
  788. */
  789. void read_key(const char *filename, u8 *key)
  790. {
  791. FILE *f = fopen(filename, "r");
  792. if (!f)
  793. {
  794. fprintf(stderr, "Unable to open %s: %s\n", filename, strerror(errno));
  795. exit(1);
  796. }
  797. char line[512] = {};
  798. fgets(line, sizeof(line), f);
  799. if (strncmp(line, "-----BEGIN ", sizeof("-----BEGIN ") - 1) != 0)
  800. {
  801. fprintf(stderr, "File %s is not a PEM file\n", filename);
  802. exit(1);
  803. }
  804. fgets(line, sizeof(line), f);
  805. line[strcspn(line, "\r\n")] = '\0';
  806. int len = strlen(line);
  807. base64_decode((u8 *)line, &len);
  808. if (len < 32)
  809. {
  810. fprintf(stderr, "Short read from %s\n", filename);
  811. exit(1);
  812. }
  813. memcpy(key, line + (len - 32), 32);
  814. fclose(f);
  815. return;
  816. }
  817. int main(int argc, char **argv)
  818. {
  819. u8 privkey[32];
  820. u8 pubkey[32];
  821. u8 result[32];
  822. if (argc != 3)
  823. {
  824. fprintf(stderr, "Usage: %s [privkey] [pubkey]\n", argv[0]);
  825. exit(1);
  826. }
  827. read_key(argv[1], privkey);
  828. read_key(argv[2], pubkey);
  829. curve25519_donna(result, privkey, pubkey);
  830. // fwrite(result, 32, 1, stdout);
  831. for (int i = 0; i < 32; i++)
  832. {
  833. printf("%d ", result[i]);
  834. }
  835. exit(0);
  836. }