tsne.cpp 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724
  1. #include "pch.h"
  2. /*
  3. *
  4. * Copyright (c) 2014, Laurens van der Maaten (Delft University of Technology)
  5. * All rights reserved.
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions are met:
  9. * 1. Redistributions of source code must retain the above copyright
  10. * notice, this list of conditions and the following disclaimer.
  11. * 2. Redistributions in binary form must reproduce the above copyright
  12. * notice, this list of conditions and the following disclaimer in the
  13. * documentation and/or other materials provided with the distribution.
  14. * 3. All advertising materials mentioning features or use of this software
  15. * must display the following acknowledgement:
  16. * This product includes software developed by the Delft University of Technology.
  17. * 4. Neither the name of the Delft University of Technology nor the names of
  18. * its contributors may be used to endorse or promote products derived from
  19. * this software without specific prior written permission.
  20. *
  21. * THIS SOFTWARE IS PROVIDED BY LAURENS VAN DER MAATEN ''AS IS'' AND ANY EXPRESS
  22. * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  23. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  24. * EVENT SHALL LAURENS VAN DER MAATEN BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  26. * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
  27. * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  29. * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
  30. * OF SUCH DAMAGE.
  31. *
  32. */
  33. #include <cfloat>
  34. #include <cmath>
  35. #include <cstdlib>
  36. #include <cstdio>
  37. #include <cstring>
  38. #include <ctime>
  39. #include "sptree.h"
  40. #include "tsne.h"
  41. #include "vptree.h"
  42. #pragma warning(disable:4996)
  43. using namespace std;
  44. static double sign(double x) { return (x == .0 ? .0 : (x < .0 ? -1.0 : 1.0)); }
  45. static void zeroMean(double* X, int N, int D);
  46. static void computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity);
  47. static void computeGaussianPerplexity(double* X, int N, int D, unsigned int** _row_P, unsigned int** _col_P, double** _val_P, double perplexity, int K);
  48. static double randn();
  49. static void computeExactGradient(double* P, double* Y, int N, int D, double* dC);
  50. static void computeGradient(unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta);
  51. static double evaluateError(double* P, double* Y, int N, int D);
  52. static double evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta);
  53. static void computeSquaredEuclideanDistance(double* X, int N, int D, double* DD);
  54. static void symmetrizeMatrix(unsigned int** row_P, unsigned int** col_P, double** val_P, int N);
  55. // Perform t-SNE
  56. void TSNE::run(double* X, int N, int D, double* Y, int no_dims, double perplexity, double theta, double eta, int rand_seed,
  57. bool skip_random_init, int max_iter, int stop_lying_iter, int mom_switch_iter) {
  58. // Set random seed
  59. if (skip_random_init != true) {
  60. if(rand_seed >= 0) {
  61. printf("Using random seed: %d\n", rand_seed);
  62. srand((unsigned int) rand_seed);
  63. } else {
  64. printf("Using current time as random seed...\n");
  65. srand(time(NULL));
  66. }
  67. }
  68. // Determine whether we are using an exact algorithm
  69. if(N - 1 < 3 * perplexity) { printf("Perplexity too large for the number of data points!\n"); exit(1); }
  70. printf("Using no_dims = %d, perplexity = %f, and theta = %f\n", no_dims, perplexity, theta);
  71. bool exact = (theta == .0) ? true : false;
  72. // Set learning parameters
  73. float total_time = .0;
  74. clock_t start, end;
  75. double momentum = .5, final_momentum = .8;
  76. // Allocate some memory
  77. double* dY = (double*) malloc(N * no_dims * sizeof(double));
  78. double* uY = (double*) malloc(N * no_dims * sizeof(double));
  79. double* gains = (double*) malloc(N * no_dims * sizeof(double));
  80. if(dY == NULL || uY == NULL || gains == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  81. for(int i = 0; i < N * no_dims; i++) uY[i] = .0;
  82. for(int i = 0; i < N * no_dims; i++) gains[i] = 1.0;
  83. // Normalize input data (to prevent numerical problems)
  84. printf("Computing input similarities...\n");
  85. start = clock();
  86. zeroMean(X, N, D);
  87. double max_X = .0;
  88. for(int i = 0; i < N * D; i++) {
  89. if(fabs(X[i]) > max_X) max_X = fabs(X[i]);
  90. }
  91. for(int i = 0; i < N * D; i++) X[i] /= max_X;
  92. // Compute input similarities for exact t-SNE
  93. double* P = nullptr; unsigned int* row_P = nullptr; unsigned int* col_P = nullptr; double* val_P = nullptr;
  94. if(exact) {
  95. // Compute similarities
  96. printf("Exact?");
  97. P = (double*) malloc(N * N * sizeof(double));
  98. if(P == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  99. computeGaussianPerplexity(X, N, D, P, perplexity);
  100. // Symmetrize input similarities
  101. printf("Symmetrizing...\n");
  102. int nN = 0;
  103. for(int n = 0; n < N; n++) {
  104. int mN = (n + 1) * N;
  105. for(int m = n + 1; m < N; m++) {
  106. P[nN + m] += P[mN + n];
  107. P[mN + n] = P[nN + m];
  108. mN += N;
  109. }
  110. nN += N;
  111. }
  112. double sum_P = .0;
  113. for(int i = 0; i < N * N; i++) sum_P += P[i];
  114. for(int i = 0; i < N * N; i++) P[i] /= sum_P;
  115. }
  116. // Compute input similarities for approximate t-SNE
  117. else {
  118. // Compute asymmetric pairwise input similarities
  119. computeGaussianPerplexity(X, N, D, &row_P, &col_P, &val_P, perplexity, (int) (3 * perplexity));
  120. // Symmetrize input similarities
  121. symmetrizeMatrix(&row_P, &col_P, &val_P, N);
  122. double sum_P = .0;
  123. for(int i = 0; i < row_P[N]; i++) sum_P += val_P[i];
  124. for(int i = 0; i < row_P[N]; i++) val_P[i] /= sum_P;
  125. }
  126. end = clock();
  127. // Lie about the P-values
  128. if(exact) { for(int i = 0; i < N * N; i++) P[i] *= 12.0; }
  129. else { for(int i = 0; i < row_P[N]; i++) val_P[i] *= 12.0; }
  130. // Initialize solution (randomly)
  131. if (skip_random_init != true) {
  132. for(int i = 0; i < N * no_dims; i++) Y[i] = randn() * .0001;
  133. }
  134. // Perform main training loop
  135. if(exact) printf("Input similarities computed in %4.2f seconds!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC);
  136. else printf("Input similarities computed in %4.2f seconds (sparsity = %f)!\nLearning embedding...\n", (float) (end - start) / CLOCKS_PER_SEC, (double) row_P[N] / ((double) N * (double) N));
  137. start = clock();
  138. double last_C = -1;
  139. for(int iter = 0; iter < max_iter; iter++) {
  140. // Compute (approximate) gradient
  141. if(exact) computeExactGradient(P, Y, N, no_dims, dY);
  142. else computeGradient(row_P, col_P, val_P, Y, N, no_dims, dY, theta);
  143. // Update gains
  144. for(int i = 0; i < N * no_dims; i++) gains[i] = (sign(dY[i]) != sign(uY[i])) ? (gains[i] + .2) : (gains[i] * .8);
  145. for(int i = 0; i < N * no_dims; i++) if(gains[i] < .01) gains[i] = .01;
  146. // Perform gradient update (with momentum and gains)
  147. for(int i = 0; i < N * no_dims; i++) uY[i] = momentum * uY[i] - eta * gains[i] * dY[i];
  148. for(int i = 0; i < N * no_dims; i++) Y[i] = Y[i] + uY[i];
  149. // Make solution zero-mean
  150. zeroMean(Y, N, no_dims);
  151. // Stop lying about the P-values after a while, and switch momentum
  152. if(iter == stop_lying_iter) {
  153. if(exact) { for(int i = 0; i < N * N; i++) P[i] /= 12.0; }
  154. else { for(int i = 0; i < row_P[N]; i++) val_P[i] /= 12.0; }
  155. }
  156. if(iter == mom_switch_iter) momentum = final_momentum;
  157. // Print out progress
  158. if (iter > 0 && (iter % 50 == 0 || iter == max_iter - 1)) {
  159. end = clock();
  160. double C = .0;
  161. if(exact) C = evaluateError(P, Y, N, no_dims);
  162. else C = evaluateError(row_P, col_P, val_P, Y, N, no_dims, theta); // doing approximate computation here!
  163. if(iter == 0)
  164. printf("Iteration %d: error is %f\n", iter + 1, C);
  165. else {
  166. total_time += (float) (end - start) / CLOCKS_PER_SEC;
  167. printf("Iteration %d: error is %f (50 iterations in %4.2f seconds)\n", iter, C, (float) (end - start) / CLOCKS_PER_SEC);
  168. }
  169. start = clock();
  170. /*if (std::fabs(last_C - C) < 0.001) {
  171. break;
  172. }
  173. last_C = C;*/
  174. }
  175. }
  176. end = clock(); total_time += (float) (end - start) / CLOCKS_PER_SEC;
  177. // Clean up memory
  178. free(dY);
  179. free(uY);
  180. free(gains);
  181. if(exact) free(P);
  182. else {
  183. free(row_P); row_P = NULL;
  184. free(col_P); col_P = NULL;
  185. free(val_P); val_P = NULL;
  186. }
  187. printf("Fitting performed in %4.2f seconds.\n", total_time);
  188. }
  189. // Compute gradient of the t-SNE cost function (using Barnes-Hut algorithm)
  190. static void computeGradient(unsigned int* inp_row_P, unsigned int* inp_col_P, double* inp_val_P, double* Y, int N, int D, double* dC, double theta)
  191. {
  192. // Construct space-partitioning tree on current map
  193. SPTree* tree = new SPTree(D, Y, N);
  194. // Compute all terms required for t-SNE gradient
  195. double sum_Q = .0;
  196. double* pos_f = (double*) calloc(N * D, sizeof(double));
  197. double* neg_f = (double*) calloc(N * D, sizeof(double));
  198. if(pos_f == NULL || neg_f == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  199. tree->computeEdgeForces(inp_row_P, inp_col_P, inp_val_P, N, pos_f);
  200. for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, neg_f + n * D, &sum_Q);
  201. // Compute final t-SNE gradient
  202. for(int i = 0; i < N * D; i++) {
  203. dC[i] = pos_f[i] - (neg_f[i] / sum_Q);
  204. }
  205. free(pos_f);
  206. free(neg_f);
  207. delete tree;
  208. }
  209. // Compute gradient of the t-SNE cost function (exact)
  210. static void computeExactGradient(double* P, double* Y, int N, int D, double* dC) {
  211. // Make sure the current gradient contains zeros
  212. for(int i = 0; i < N * D; i++) dC[i] = 0.0;
  213. // Compute the squared Euclidean distance matrix
  214. double* DD = (double*) malloc(N * N * sizeof(double));
  215. if(DD == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  216. computeSquaredEuclideanDistance(Y, N, D, DD);
  217. // Compute Q-matrix and normalization sum
  218. double* Q = (double*) malloc(N * N * sizeof(double));
  219. if(Q == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  220. double sum_Q = .0;
  221. int nN = 0;
  222. for(int n = 0; n < N; n++) {
  223. for(int m = 0; m < N; m++) {
  224. if(n != m) {
  225. Q[nN + m] = 1 / (1 + DD[nN + m]);
  226. sum_Q += Q[nN + m];
  227. }
  228. }
  229. nN += N;
  230. }
  231. // Perform the computation of the gradient
  232. nN = 0;
  233. int nD = 0;
  234. for(int n = 0; n < N; n++) {
  235. int mD = 0;
  236. for(int m = 0; m < N; m++) {
  237. if(n != m) {
  238. double mult = (P[nN + m] - (Q[nN + m] / sum_Q)) * Q[nN + m];
  239. for(int d = 0; d < D; d++) {
  240. dC[nD + d] += (Y[nD + d] - Y[mD + d]) * mult;
  241. }
  242. }
  243. mD += D;
  244. }
  245. nN += N;
  246. nD += D;
  247. }
  248. // Free memory
  249. free(DD); DD = NULL;
  250. free(Q); Q = NULL;
  251. }
  252. // Evaluate t-SNE cost function (exactly)
  253. static double evaluateError(double* P, double* Y, int N, int D) {
  254. // Compute the squared Euclidean distance matrix
  255. double* DD = (double*) malloc(N * N * sizeof(double));
  256. double* Q = (double*) malloc(N * N * sizeof(double));
  257. if(DD == NULL || Q == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  258. computeSquaredEuclideanDistance(Y, N, D, DD);
  259. // Compute Q-matrix and normalization sum
  260. int nN = 0;
  261. double sum_Q = DBL_MIN;
  262. for(int n = 0; n < N; n++) {
  263. for(int m = 0; m < N; m++) {
  264. if(n != m) {
  265. Q[nN + m] = 1 / (1 + DD[nN + m]);
  266. sum_Q += Q[nN + m];
  267. }
  268. else Q[nN + m] = DBL_MIN;
  269. }
  270. nN += N;
  271. }
  272. for(int i = 0; i < N * N; i++) Q[i] /= sum_Q;
  273. // Sum t-SNE error
  274. double C = .0;
  275. for(int n = 0; n < N * N; n++) {
  276. C += P[n] * log((P[n] + FLT_MIN) / (Q[n] + FLT_MIN));
  277. }
  278. // Clean up memory
  279. free(DD);
  280. free(Q);
  281. return C;
  282. }
  283. // Evaluate t-SNE cost function (approximately)
  284. static double evaluateError(unsigned int* row_P, unsigned int* col_P, double* val_P, double* Y, int N, int D, double theta)
  285. {
  286. // Get estimate of normalization term
  287. SPTree* tree = new SPTree(D, Y, N);
  288. double* buff = (double*) calloc(D, sizeof(double));
  289. double sum_Q = .0;
  290. for(int n = 0; n < N; n++) tree->computeNonEdgeForces(n, theta, buff, &sum_Q);
  291. // Loop over all edges to compute t-SNE error
  292. int ind1, ind2;
  293. double C = .0, Q;
  294. for(int n = 0; n < N; n++) {
  295. ind1 = n * D;
  296. for(int i = row_P[n]; i < row_P[n + 1]; i++) {
  297. Q = .0;
  298. ind2 = col_P[i] * D;
  299. for(int d = 0; d < D; d++) buff[d] = Y[ind1 + d];
  300. for(int d = 0; d < D; d++) buff[d] -= Y[ind2 + d];
  301. for(int d = 0; d < D; d++) Q += buff[d] * buff[d];
  302. Q = (1.0 / (1.0 + Q)) / sum_Q;
  303. C += val_P[i] * log((val_P[i] + FLT_MIN) / (Q + FLT_MIN));
  304. }
  305. }
  306. // Clean up memory
  307. free(buff);
  308. delete tree;
  309. return C;
  310. }
  311. // Compute input similarities with a fixed perplexity
  312. static void computeGaussianPerplexity(double* X, int N, int D, double* P, double perplexity) {
  313. // Compute the squared Euclidean distance matrix
  314. double* DD = (double*) malloc(N * N * sizeof(double));
  315. if(DD == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  316. computeSquaredEuclideanDistance(X, N, D, DD);
  317. // Compute the Gaussian kernel row by row
  318. int nN = 0;
  319. for(int n = 0; n < N; n++) {
  320. // Initialize some variables
  321. bool found = false;
  322. double beta = 1.0;
  323. double min_beta = -DBL_MAX;
  324. double max_beta = DBL_MAX;
  325. double tol = 1e-5;
  326. double sum_P;
  327. // Iterate until we found a good perplexity
  328. int iter = 0;
  329. while(!found && iter < 200) {
  330. // Compute Gaussian kernel row
  331. for(int m = 0; m < N; m++) P[nN + m] = exp(-beta * DD[nN + m]);
  332. P[nN + n] = DBL_MIN;
  333. // Compute entropy of current row
  334. sum_P = DBL_MIN;
  335. for(int m = 0; m < N; m++) sum_P += P[nN + m];
  336. double H = 0.0;
  337. for(int m = 0; m < N; m++) H += beta * (DD[nN + m] * P[nN + m]);
  338. H = (H / sum_P) + log(sum_P);
  339. // Evaluate whether the entropy is within the tolerance level
  340. double Hdiff = H - log(perplexity);
  341. if(Hdiff < tol && -Hdiff < tol) {
  342. found = true;
  343. }
  344. else {
  345. if(Hdiff > 0) {
  346. min_beta = beta;
  347. if(max_beta == DBL_MAX || max_beta == -DBL_MAX)
  348. beta *= 2.0;
  349. else
  350. beta = (beta + max_beta) / 2.0;
  351. }
  352. else {
  353. max_beta = beta;
  354. if(min_beta == -DBL_MAX || min_beta == DBL_MAX)
  355. beta /= 2.0;
  356. else
  357. beta = (beta + min_beta) / 2.0;
  358. }
  359. }
  360. // Update iteration counter
  361. iter++;
  362. }
  363. // Row normalize P
  364. for(int m = 0; m < N; m++) P[nN + m] /= sum_P;
  365. nN += N;
  366. }
  367. // Clean up memory
  368. free(DD); DD = NULL;
  369. }
  370. // Compute input similarities with a fixed perplexity using ball trees (this function allocates memory another function should free)
  371. static void computeGaussianPerplexity(double* X, int N, int D, unsigned int** _row_P, unsigned int** _col_P, double** _val_P, double perplexity, int K) {
  372. if(perplexity > K) printf("Perplexity should be lower than K!\n");
  373. // Allocate the memory we need
  374. *_row_P = (unsigned int*) malloc((N + 1) * sizeof(unsigned int));
  375. *_col_P = (unsigned int*) calloc(N * K, sizeof(unsigned int));
  376. *_val_P = (double*) calloc(N * K, sizeof(double));
  377. if(*_row_P == NULL || *_col_P == NULL || *_val_P == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  378. unsigned int* row_P = *_row_P;
  379. unsigned int* col_P = *_col_P;
  380. double* val_P = *_val_P;
  381. double* cur_P = (double*) malloc((N - 1) * sizeof(double));
  382. if(cur_P == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  383. row_P[0] = 0;
  384. for(int n = 0; n < N; n++) row_P[n + 1] = row_P[n] + (unsigned int) K;
  385. // Build ball tree on data set
  386. VpTree<DataPoint, euclidean_distance>* tree = new VpTree<DataPoint, euclidean_distance>();
  387. vector<DataPoint> obj_X(N, DataPoint(D, -1, X));
  388. for(int n = 0; n < N; n++) obj_X[n] = DataPoint(D, n, X + n * D);
  389. tree->create(obj_X);
  390. // Loop over all points to find nearest neighbors
  391. printf("Building tree...\n");
  392. vector<DataPoint> indices;
  393. vector<double> distances;
  394. for(int n = 0; n < N; n++) {
  395. if(n % 10000 == 0) printf(" - point %d of %d\n", n, N);
  396. // Find nearest neighbors
  397. indices.clear();
  398. distances.clear();
  399. tree->search(obj_X[n], K + 1, &indices, &distances);
  400. // Initialize some variables for binary search
  401. bool found = false;
  402. double beta = 1.0;
  403. double min_beta = -DBL_MAX;
  404. double max_beta = DBL_MAX;
  405. double tol = 1e-5;
  406. // Iterate until we found a good perplexity
  407. int iter = 0; double sum_P;
  408. while(!found && iter < 200) {
  409. // Compute Gaussian kernel row
  410. for(int m = 0; m < K; m++) cur_P[m] = exp(-beta * distances[m + 1] * distances[m + 1]);
  411. // Compute entropy of current row
  412. sum_P = DBL_MIN;
  413. for(int m = 0; m < K; m++) sum_P += cur_P[m];
  414. double H = .0;
  415. for(int m = 0; m < K; m++) H += beta * (distances[m + 1] * distances[m + 1] * cur_P[m]);
  416. H = (H / sum_P) + log(sum_P);
  417. // Evaluate whether the entropy is within the tolerance level
  418. double Hdiff = H - log(perplexity);
  419. if(Hdiff < tol && -Hdiff < tol) {
  420. found = true;
  421. }
  422. else {
  423. if(Hdiff > 0) {
  424. min_beta = beta;
  425. if(max_beta == DBL_MAX || max_beta == -DBL_MAX)
  426. beta *= 2.0;
  427. else
  428. beta = (beta + max_beta) / 2.0;
  429. }
  430. else {
  431. max_beta = beta;
  432. if(min_beta == -DBL_MAX || min_beta == DBL_MAX)
  433. beta /= 2.0;
  434. else
  435. beta = (beta + min_beta) / 2.0;
  436. }
  437. }
  438. // Update iteration counter
  439. iter++;
  440. }
  441. // Row-normalize current row of P and store in matrix
  442. for(unsigned int m = 0; m < K; m++) cur_P[m] /= sum_P;
  443. for(unsigned int m = 0; m < K; m++) {
  444. col_P[row_P[n] + m] = (unsigned int) indices[m + 1].index();
  445. val_P[row_P[n] + m] = cur_P[m];
  446. }
  447. }
  448. // Clean up memory
  449. obj_X.clear();
  450. free(cur_P);
  451. delete tree;
  452. }
  453. // Symmetrizes a sparse matrix
  454. static void symmetrizeMatrix(unsigned int** _row_P, unsigned int** _col_P, double** _val_P, int N) {
  455. // Get sparse matrix
  456. unsigned int* row_P = *_row_P;
  457. unsigned int* col_P = *_col_P;
  458. double* val_P = *_val_P;
  459. // Count number of elements and row counts of symmetric matrix
  460. int* row_counts = (int*) calloc(N, sizeof(int));
  461. if(row_counts == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  462. for(int n = 0; n < N; n++) {
  463. for(int i = row_P[n]; i < row_P[n + 1]; i++) {
  464. // Check whether element (col_P[i], n) is present
  465. bool present = false;
  466. for(int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) {
  467. if(col_P[m] == n) present = true;
  468. }
  469. if(present) row_counts[n]++;
  470. else {
  471. row_counts[n]++;
  472. row_counts[col_P[i]]++;
  473. }
  474. }
  475. }
  476. int no_elem = 0;
  477. for(int n = 0; n < N; n++) no_elem += row_counts[n];
  478. // Allocate memory for symmetrized matrix
  479. unsigned int* sym_row_P = (unsigned int*) malloc((N + 1) * sizeof(unsigned int));
  480. unsigned int* sym_col_P = (unsigned int*) malloc(no_elem * sizeof(unsigned int));
  481. double* sym_val_P = (double*) malloc(no_elem * sizeof(double));
  482. if(sym_row_P == NULL || sym_col_P == NULL || sym_val_P == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  483. // Construct new row indices for symmetric matrix
  484. sym_row_P[0] = 0;
  485. for(int n = 0; n < N; n++) sym_row_P[n + 1] = sym_row_P[n] + (unsigned int) row_counts[n];
  486. // Fill the result matrix
  487. int* offset = (int*) calloc(N, sizeof(int));
  488. if(offset == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  489. for(int n = 0; n < N; n++) {
  490. for(unsigned int i = row_P[n]; i < row_P[n + 1]; i++) { // considering element(n, col_P[i])
  491. // Check whether element (col_P[i], n) is present
  492. bool present = false;
  493. for(unsigned int m = row_P[col_P[i]]; m < row_P[col_P[i] + 1]; m++) {
  494. if(col_P[m] == n) {
  495. present = true;
  496. if(n <= col_P[i]) { // make sure we do not add elements twice
  497. sym_col_P[sym_row_P[n] + offset[n]] = col_P[i];
  498. sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n;
  499. sym_val_P[sym_row_P[n] + offset[n]] = val_P[i] + val_P[m];
  500. sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i] + val_P[m];
  501. }
  502. }
  503. }
  504. // If (col_P[i], n) is not present, there is no addition involved
  505. if(!present) {
  506. sym_col_P[sym_row_P[n] + offset[n]] = col_P[i];
  507. sym_col_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = n;
  508. sym_val_P[sym_row_P[n] + offset[n]] = val_P[i];
  509. sym_val_P[sym_row_P[col_P[i]] + offset[col_P[i]]] = val_P[i];
  510. }
  511. // Update offsets
  512. if(!present || (present && n <= col_P[i])) {
  513. offset[n]++;
  514. if(col_P[i] != n) offset[col_P[i]]++;
  515. }
  516. }
  517. }
  518. // Divide the result by two
  519. for(int i = 0; i < no_elem; i++) sym_val_P[i] /= 2.0;
  520. // Return symmetrized matrices
  521. free(*_row_P); *_row_P = sym_row_P;
  522. free(*_col_P); *_col_P = sym_col_P;
  523. free(*_val_P); *_val_P = sym_val_P;
  524. // Free up some memery
  525. free(offset); offset = NULL;
  526. free(row_counts); row_counts = NULL;
  527. }
  528. // Compute squared Euclidean distance matrix
  529. static void computeSquaredEuclideanDistance(double* X, int N, int D, double* DD) {
  530. const double* XnD = X;
  531. for(int n = 0; n < N; ++n, XnD += D) {
  532. const double* XmD = XnD + D;
  533. double* curr_elem = &DD[n*N + n];
  534. *curr_elem = 0.0;
  535. double* curr_elem_sym = curr_elem + N;
  536. for(int m = n + 1; m < N; ++m, XmD+=D, curr_elem_sym+=N) {
  537. *(++curr_elem) = 0.0;
  538. for(int d = 0; d < D; ++d) {
  539. *curr_elem += (XnD[d] - XmD[d]) * (XnD[d] - XmD[d]);
  540. }
  541. *curr_elem_sym = *curr_elem;
  542. }
  543. }
  544. }
  545. // Makes data zero-mean
  546. static void zeroMean(double* X, int N, int D) {
  547. // Compute data mean
  548. double* mean = (double*) calloc(D, sizeof(double));
  549. if(mean == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  550. int nD = 0;
  551. for(int n = 0; n < N; n++) {
  552. for(int d = 0; d < D; d++) {
  553. mean[d] += X[nD + d];
  554. }
  555. nD += D;
  556. }
  557. for(int d = 0; d < D; d++) {
  558. mean[d] /= (double) N;
  559. }
  560. // Subtract data mean
  561. nD = 0;
  562. for(int n = 0; n < N; n++) {
  563. for(int d = 0; d < D; d++) {
  564. X[nD + d] -= mean[d];
  565. }
  566. nD += D;
  567. }
  568. free(mean); mean = NULL;
  569. }
  570. // Generates a Gaussian random number
  571. static double randn() {
  572. double x, y, radius;
  573. do {
  574. x = 2 * (rand() / ((double) RAND_MAX + 1)) - 1;
  575. y = 2 * (rand() / ((double) RAND_MAX + 1)) - 1;
  576. radius = (x * x) + (y * y);
  577. } while((radius >= 1.0) || (radius == 0.0));
  578. radius = sqrt(-2 * log(radius) / radius);
  579. x *= radius;
  580. y *= radius;
  581. return x;
  582. }
  583. // Function that loads data from a t-SNE file
  584. // Note: this function does a malloc that should be freed elsewhere
  585. bool TSNE::load_data(double** data, int* n, int* d, int* no_dims, double* theta, double* perplexity, int* rand_seed, int* max_iter) {
  586. // Open file, read first 2 integers, allocate memory, and read the data
  587. FILE *h;
  588. if((h = fopen("data.dat", "r+b")) == NULL) {
  589. printf("Error: could not open data file.\n");
  590. return false;
  591. }
  592. fread(n, sizeof(int), 1, h); // number of datapoints
  593. fread(d, sizeof(int), 1, h); // original dimensionality
  594. fread(theta, sizeof(double), 1, h); // gradient accuracy
  595. fread(perplexity, sizeof(double), 1, h); // perplexity
  596. fread(no_dims, sizeof(int), 1, h); // output dimensionality
  597. fread(max_iter, sizeof(int),1,h); // maximum number of iterations
  598. *data = (double*) malloc(*d * *n * sizeof(double));
  599. if(*data == NULL) { printf("Memory allocation failed!\n"); exit(1); }
  600. fread(*data, sizeof(double), *n * *d, h); // the data
  601. if(!feof(h)) fread(rand_seed, sizeof(int), 1, h); // random seed
  602. fclose(h);
  603. printf("Read the %i x %i data matrix successfully!\n", *n, *d);
  604. return true;
  605. }
  606. // Function that saves map to a t-SNE file
  607. void TSNE::save_data(double* data, int* landmarks, double* costs, int n, int d) {
  608. // Open file, write first 2 integers and then the data
  609. FILE *h;
  610. if((h = fopen("result.dat", "w+b")) == NULL) {
  611. printf("Error: could not open data file.\n");
  612. return;
  613. }
  614. fwrite(&n, sizeof(int), 1, h);
  615. fwrite(&d, sizeof(int), 1, h);
  616. fwrite(data, sizeof(double), n * d, h);
  617. fwrite(landmarks, sizeof(int), n, h);
  618. fwrite(costs, sizeof(double), n, h);
  619. fclose(h);
  620. printf("Wrote the %i x %i data matrix successfully!\n", n, d);
  621. }