/* * cranksum.c: 2-sample Rank-Sum Test for clustered data * * This code implements a rank-sum test for clustered data [1] for * two-sample cases only. Two simple functions are provided for * the test: * * ========================================================================== * struct rank_sum_result *clustered_rank_sum(const double *X, * const int *group, const int *cluster, * size_t length) * * ARGUMENTS: * * runs a rank sum test for given data. * X: continuous value for each sample points * group: test group that assigned each data point to a group. * must be either 0 or 1. * cluster: associates each data point to a cluster. Cluster * numbers must be integers starting from 0. There must * not be any empty cluster between two adjacent clusters. * For example: {0, 1, 1, 0, 2, 1, 0, 1} is valid. * but, {0, 1, 3, 1, 0, 1, 3} is invalid because no * datapoint is assigned to cluster #2. * * RETURNS: * * returns a newly-allocated pointer to an array of four double * precision floating point numbers in order of S, E(S), Var(S) and * Z. You must deallocate with free_clustered_rank_sum_result * when you're done using the result. * * * ========================================================================== * void free_clustered_rank_sum_result(struct rank_sum_result *result) * * ARGUMENTS: * * pointer of the result to deallocate. * * * Feel free to write to Hyeshik Chang if you have * suggestions or questions for this program. * * * [1] Datta, S. and Satten, G. A. Rank-sum tests for clustered data, * Journal of the American Statistical Association, 100, 908-915 (2005). * * * ----- * Copyright (c) 2009 RNA Biology Laboratory, School of Biological Sciences, * Seoul National University, South Korea. * * This software is provided 'as-is', without any express or implied * warranty. In no event will the authors be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute it * freely, subject to the following restrictions: * * 1. The origin of this software must not be misrepresented; you must not * claim that you wrote the original software. If you use this software * in a product, an acknowledgment in the product documentation would be * appreciated but is not required. * * 2. Altered source versions must be plainly marked as such, and must not * be misrepresented as being the original software. * * 3. This notice may not be removed or altered from any source * distribution. * */ #include #include #include typedef double rsvalue_t; struct rank_sum_variables { rsvalue_t X; int group; int cluster; }; struct rank_sum_result { double S; double E_S; double Var_S; double Z; }; struct cluster_member; struct cluster_member { size_t index; struct cluster_member *next; }; static int compare_rank_sum_variables(const void *v, const void *w) { struct rank_sum_variables *left = (struct rank_sum_variables *)v; struct rank_sum_variables *right = (struct rank_sum_variables *)w; if (left->X < right->X) return -1; else if (left->X > right->X) return 1; else return 0; } static void free_member_lists(struct cluster_member **lists, size_t nclusters) { size_t i; for (i = 0; i < nclusters; i++) { struct cluster_member *cur, *next; for (cur = lists[i]; cur != NULL; cur = next) { next = cur->next; free(cur); } } free(lists); } static struct cluster_member ** build_cluster_member_lists(const struct rank_sum_variables *vars, size_t length, size_t *clustersizes, size_t nclusters) { struct cluster_member **lists; size_t i; lists = malloc(sizeof(struct cluster_member *) * nclusters); if (lists == NULL) return NULL; for (i = 0; i < nclusters; i++) { lists[i] = NULL; clustersizes[i] = 0; } for (i = length; i > 0; i--) { struct cluster_member **slot, *new; int cluster; cluster = vars[i - 1].cluster; slot = &lists[cluster]; new = malloc(sizeof(struct cluster_member)); if (new == NULL) { free_member_lists(lists, nclusters); return NULL; } new->index = i - 1; new->next = *slot; *slot = new; clustersizes[cluster]++; } return lists; } /** * F_hat = np.array([(sum(X <= xi) + sum(X < xi)) / (2 * N) for xi in X]) */ static double * calc_F_hat(const struct rank_sum_variables *vars, size_t length) { size_t i, j, st; rsvalue_t prev; double *F_hat; F_hat = malloc(sizeof(double) * length); if (F_hat == NULL) return NULL; prev = vars[0].X; st = 0; for (i = 0; i <= length; i++) if (i == length || vars[i].X != prev) { double rankpos; rankpos = (i + st) / 2.0 / length; for (j = st; j < i; j++) F_hat[j] = rankpos; if (i != length) { prev = vars[i].X; st = i; } } return F_hat; } /** * F_prop = np.array([ * sum((sum(X[membs] < xii) + 0.5 * sum(X[membs] == xii)) / len(membs) * for name, membs in clustersets * if name != clusterii) * for xii, clusterii in zip(X, cluster) * ]) */ static double * calc_F_prop(const struct rank_sum_variables *vars, size_t nvars, struct cluster_member *const *clusterlists, size_t nclusters, const size_t *clustersizes) { double *F_prop; size_t i, j; F_prop = malloc(sizeof(double) * nvars); for (i = 0; i < nvars; i++) { rsvalue_t X; double smaller_frac = 0.0; X = vars[i].X; for (j = 0; j < nclusters; j++) { const struct cluster_member *clmem; double num_smaller_members; if (vars[i].cluster == j) continue; num_smaller_members = 0.0; for (clmem = clusterlists[j]; clmem != NULL; clmem = clmem->next) { if (vars[clmem->index].X < X) num_smaller_members += 1.0; else if (vars[clmem->index].X == X) num_smaller_members += 0.5; else break; /* no more smaller values because the list is sorted.*/ } smaller_frac += num_smaller_members / clustersizes[j]; } F_prop[i] = smaller_frac; } return F_prop; } /** * b = 1 + F_prop * a = [sum(grp[membs] * b[membs]) / len(membs) for membs in clustermembers] * c = 1 / (M + 1) * S = c * sum(a) */ static double calc_S(const struct rank_sum_variables *vars, size_t nvars, struct cluster_member *const *clusterlists, size_t nclusters, const size_t *clustersizes) { double *F_prop; size_t i; double sum; F_prop = calc_F_prop(vars, nvars, clusterlists, nclusters, clustersizes); if (F_prop == NULL) return NAN; for (i = 0; i < nvars; i++) F_prop[i] += 1.0; sum = 0.0; for (i = 0; i < nclusters; i++) { const struct cluster_member *clmem; double clsum = 0.0; for (clmem = clusterlists[i]; clmem != NULL; clmem = clmem->next) clsum += F_prop[clmem->index] * vars[clmem->index].group; sum += clsum / clustersizes[i]; } free(F_prop); return sum / (nclusters + 1); } static double * calc_group1_frac(const struct rank_sum_variables *vars, size_t nvars, struct cluster_member *const *clusterlists, size_t nclusters, const size_t *clustersizes) { double *group1_frac; size_t i; group1_frac = malloc(sizeof(double) * nclusters); if (group1_frac == NULL) return NULL; for (i = 0; i < nclusters; i++) { const struct cluster_member *clmem; size_t grp1cnt = 0; for (clmem = clusterlists[i]; clmem != NULL; clmem = clmem->next) grp1cnt += vars[clmem->index].group; group1_frac[i] = (double)grp1cnt / clustersizes[i]; } return group1_frac; } static double calc_E_S(const double *group1_frac, size_t nclusters) { double sum = 0.0; size_t i; for (i = 0; i < nclusters; i++) sum += group1_frac[i]; return sum / 2; } /* * W_hat = np.zeros(M) * a = group1_frac * for i, (membs, n_i) in enumerate(zip(clustermembers, numcluster)): * b = 1 / (n_i * (M + 1)) * c = grp[membs] * (M - 1) * d = sum(m for j, m in enumerate(a) if j != i) * W_hat[i] = b * sum((c - d) * F_hat[membs]) */ static double * calc_W_hat(const struct rank_sum_variables *vars, size_t nvars, struct cluster_member *const *clusterlists, size_t nclusters, const size_t *clustersizes, const double *group1_frac) { double *F_hat; double *W_hat; size_t i; size_t nclusters_1 = nclusters - 1; F_hat = calc_F_hat(vars, nvars); if (F_hat == NULL) return NULL; W_hat = malloc(sizeof(double) * nclusters); if (W_hat == NULL) { free(F_hat); return NULL; } for (i = 0; i < nclusters; i++) { const struct cluster_member *clmem; double b, d, sum; size_t j; b = clustersizes[i] * (nclusters + 1); for (d = 0.0, j = 0; j < nclusters; j++) if (j != i) d += group1_frac[j]; sum = 0.0; for (clmem = clusterlists[i]; clmem != NULL; clmem = clmem->next) sum += F_hat[clmem->index] * ( vars[clmem->index].group * nclusters_1 - d); W_hat[i] = sum / b; } free(F_hat); return W_hat; } /** * E_W = (M / (2 * (M + 1))) * (a - sum(a) / M) */ static double * calc_E_W(const double *group1_frac, size_t nclusters) { double *E_W; double avg_group1_frac; double commonpart; size_t i; E_W = malloc(sizeof(double) * nclusters); if (E_W == NULL) return NULL; avg_group1_frac = 0.0; for (i = 0; i < nclusters; i++) avg_group1_frac += group1_frac[i]; avg_group1_frac /= nclusters; commonpart = (double)nclusters / (2 * (nclusters + 1)); for (i = 0; i < nclusters; i++) E_W[i] = commonpart * (group1_frac[i] - avg_group1_frac); return E_W; } static double calc_var_S(const struct rank_sum_variables *vars, size_t nvars, struct cluster_member *const *clusterlists, size_t nclusters, const size_t *clustersizes, const double *group1_frac) { double *W_hat, *E_W; double var_S; size_t i; W_hat = calc_W_hat(vars, nvars, clusterlists, nclusters, clustersizes, group1_frac); if (W_hat == NULL) return NAN; E_W = calc_E_W(group1_frac, nclusters); if (E_W == NULL) { free(W_hat); return NAN; } var_S = 0.0; for (i = 0; i < nclusters; i++) { double f; f = W_hat[i] - E_W[i]; var_S += f * f; } free(W_hat); free(E_W); return var_S; } struct rank_sum_result * clustered_rank_sum(const rsvalue_t *X, const int *group, const int *cluster, size_t length) { struct rank_sum_variables *vars = NULL; struct cluster_member **clusterlists = NULL; struct rank_sum_result *r; double *group1_frac = NULL; size_t i, nclusters; size_t *clustersizes = NULL; if (length < 1) return NULL; r = malloc(sizeof(struct rank_sum_result)); if (r == NULL) return NULL; vars = malloc(sizeof(struct rank_sum_variables) * length); if (vars == NULL) goto onError; for (i = 0; i < length; i++) { vars[i].X = X[i]; vars[i].group = group[i]; vars[i].cluster = cluster[i]; } qsort(vars, length, sizeof(struct rank_sum_variables), compare_rank_sum_variables); /* Take the largest value of cluster as number of clusters */ nclusters = 0; for (i = 0; i < length; i++) if (cluster[i] > nclusters) nclusters = (int)cluster[i]; nclusters++; clustersizes = malloc(sizeof(size_t) * nclusters); if (clustersizes == NULL) goto onError; clusterlists = build_cluster_member_lists(vars, length, clustersizes, nclusters); if (clusterlists == NULL) goto onError; r->S = calc_S(vars, length, clusterlists, nclusters, clustersizes); if (isnan(r->S)) goto onError; group1_frac = calc_group1_frac(vars, length, clusterlists, nclusters, clustersizes); if (group1_frac == NULL) goto onError; r->E_S = calc_E_S(group1_frac, nclusters); r->Var_S = calc_var_S(vars, length, clusterlists, nclusters, clustersizes, group1_frac); if (isnan(r->Var_S)) goto onError; r->Z = (r->S - r->E_S) / sqrt(r->Var_S); free(vars); free(clustersizes); free_member_lists(clusterlists, nclusters); free(group1_frac); return r; onError: free(r); if (vars != NULL) free(vars); if (clustersizes != NULL) free(clustersizes); if (clusterlists != NULL) free_member_lists(clusterlists, nclusters); if (group1_frac != NULL) free(group1_frac); return NULL; } void free_clustered_rank_sum_result(struct rank_sum_result *result) { free(result); } #ifdef CRANKSUM_TEST int main(void) { rsvalue_t X[9] = {1, 4, 2, 4, 6, 7, 4, 7, 8}; int cluster[9] = {0, 0, 1, 1, 1, 1, 2, 2, 2}; int grp[9] = {0, 1, 0, 0, 1, 1, 1, 0, 1}; struct rank_sum_result *r; r = clustered_rank_sum(X, grp, cluster, 9); if (r == NULL) { fprintf(stderr, "failed.\n"); return -1; } printf("S: %g\n", r->S); printf("E_S: %g\n", r->E_S); printf("Var_S: %g\n", r->Var_S); printf("Z: %g\n", r->Z); free_clustered_rank_sum_result(r); return 0; } #endif