/* Muac - A fast algorithm for the 2d KS test (v1.1, 13 June 1999). Copyright (C) 1998 Andrew Cooke (Jara Software) Comments to jara@andrewcooke.free-online.co.uk This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA These conditions also apply to any other files distributed with this program. */ /* --- the main routines for a fast 2d ks test -------- */ #include "stdlib.h" #include "stdio.h" /* --- global structs and routines -------------------- */ #include "muac.h" #undef DEBUG /* --- local structs and routines --------------------- */ typedef struct node { struct node *parent ; struct node *left ; struct node *right ; MuacRecord *data ; float score ; float min ; float max ; float delta ; } Node ; float sweep( MuacRecord *in, int nPoints, int reverse, float *score ) ; void reCalc( Node *node ) ; Node *addNode( MuacRecord *data, Node *root, Node *treeData, int *treeIndex, float *score ) ; Node *newNode( Node *parent, Node **side, MuacRecord *data, Node *treeData, int *treeIndex, float *score ) ; Node *allocTree( int nPoints, int *treeIndex ) ; Node *allocNode( Node *treeData, int *treeIndex ) ; void freeTree( Node *treeData, int *treeIndex ) ; void quickSort( MuacRecord *in, int nPoints ) ; void ranSort( MuacRecord *in, int start, int end ) ; int ranPartition( MuacRecord *in, int start, int end ) ; void exchange( MuacRecord *in, int a, int b ) ; int iRandom( int left, int right ) ; void printTree( Node *root, FILE *out ) ; void printNode( Node* node, FILE *out, int depth ) ; char* padding( int depth ) ; #define MAX( a, b ) (a > b ? a : b) #define MIN( a, b ) (a < b ? a : b) /* --- main routine ----------------------------------- */ /* the main 2-dimensional ks routine, this expects data to be sorted by y. the routine unsortedMuac (below) provides a less demanding interface. */ float muac( MuacRecord *in, int n1, int n2 ) { float maxDelta ; float *score ; int nPoints ; score = malloc( 2 * sizeof( float )) ; score[0] = 1.0 / n1 ; score[1] = -1.0 / n2 ; nPoints = n1 + n2 ; maxDelta = sweep( in, nPoints, 0, score ) ; maxDelta = MAX( maxDelta, sweep( in, nPoints, 1, score )) ; free( score ) ; /* fix for v1.1 */ return maxDelta ; } /* --- sort before processing ------------------------- */ /* for most users, this routine is the simplest interface to use, whether data are sorted or not. a randomized quicksort is used to sort the data by y - it will be slower than calling muac directly if you have already sorted data, but it is not a significant delay in most cases (in contrast, an ordinary quicksort is notoriously slow with sorted data). if you are obsessed with speed: call muac directly with sorted data. if you are sure that your data are not pre/almost sorted, check out qsort(3) (man 3 qsort) and sort data yourself before calling muac. */ float unsortedMuac( float *x1, float *y1, int n1, float *x2, float *y2, int n2 ) { int n ; int i ; MuacRecord *in ; float maxDelta ; n = n1 + n2 ; in = (MuacRecord*) malloc( n * sizeof( MuacRecord )) ; for ( i = 0 ; i < n1 ; ++i ) { in[i].x = x1[i] ; in[i].y = y1[i] ; in[i].type = 0 ; } for ( i = 0 ; i < n2 ; ++i ) { in[n1 + i].x = x2[i] ; in[n1 + i].y = y2[i] ; in[n1 + i].type = 1 ; } quickSort( in, n ) ; maxDelta = muac( in, n1, n2 ) ; free( in ) ; return maxDelta ; } /* --- sweep through the data ------------------------- */ /* this is a single pass through the data - either top to bottom or bottom to top - calculating the statistic (max cumulative difference) for the top or bottom two quadrants, respectively. */ /* re-using the total delta (as rightOrigin) allows us to calculate the right hand quadrant from the left (it's a simple offset that is non-zero if there are more of one type than the other) - this optimization is not in the corresponding python code. */ float sweep( MuacRecord *in, int nPoints, int reverse, float *score ) { int i ; int iStart ; int iEnd ; int iStep ; float maxDelta = 0.0 ; float rightOrigin ; Node *root ; Node *nextNode ; Node *treeData ; int treeIndex ; treeData = allocTree( nPoints, &treeIndex ) ; if ( reverse ) { iStart = nPoints - 1 ; iEnd = -1 ; iStep = -1 ; } else { iStart = 0 ; iEnd = nPoints ; iStep = 1 ; } /* second arg is same as return value (cheat for orphan node) */ newNode( 0, &root, in + iStart, treeData, &treeIndex, score ) ; iStart += iStep ; for ( i = iStart ; i != iEnd ; i += iStep ) { nextNode = addNode( in + i, root, treeData, &treeIndex, score ) ; reCalc( nextNode ) ; #ifdef DEBUG printTree( root, stdout ) ; #endif rightOrigin = root->delta ; maxDelta = MAX( maxDelta, MAX( MAX( root->max, root->max - rightOrigin ), -1 * MIN( root->min, root->min - rightOrigin ))) ; } freeTree( treeData, &treeIndex ) ; return maxDelta ; } /* --- calculate tree data ---------------------------- */ /* each time we add a node we need to work back up the tree, recalculating the values for each sub node. */ void reCalc( Node *node ) { for ( node = node->parent ; node ; node = node->parent ) { if ( node->left && node->right ) { node->delta = node->left->delta + node->score + node->right->delta ; node->min = MIN( MIN( node->left->min, node->left->delta + node->score ), node->left->delta + node->score + node->right->min ) ; node->max = MAX( MAX( node->left->max, node->left->delta + node->score ), node->left->delta + node->score + node->right->max ) ; } else if ( node->left ) { node->delta = node->left->delta + node->score ; node->min = MIN( node->left->min, node->left->delta + node->score ) ; node->max = MAX( node->left->max, node->left->delta + node->score ) ; } else { node->delta = node->score + node->right->delta ; node->min = MIN( MIN( 0, node->score ), node->score + node->right->min ) ; node->max = MAX( MAX( 0, node->score ), node->score + node->right->max ) ; } } } /* --- descend tree, adding new node at bottom -------- */ Node *addNode( MuacRecord *data, Node *root, Node *treeData, int *treeIndex, float *score ) { Node *parent ; Node *node = root ; Node **side ; while ( node ) { parent = node ; if ( data->x < node->data->x ) { side = &(node->left) ; node = node->left ; } else { side = &(node->right) ; node = node->right ; } } return newNode( parent, side, data, treeData, treeIndex, score ) ; } Node *newNode( Node *parent, Node **side, MuacRecord *data, Node *treeData, int *treeIndex, float *score ) { Node *node ; node = allocNode( treeData, treeIndex ) ; *side = node ; node->parent = parent ; node->data = data ; node->score = score[data->type] ; node->delta = node->score ; node->max = MAX( 0, node->delta ) ; node->min = MIN( 0, node->delta ) ; node->left = NULL ; node->right = NULL ; return node ; } /* --- manage our own chunk of memory ----------------- */ Node *allocTree( int nPoints, int *treeIndex ) { *treeIndex = 0 ; return (Node*) calloc( nPoints, sizeof( Node ) ) ; } Node *allocNode( Node *treeData, int *treeIndex ) { return &(treeData[(*treeIndex)++]) ; } void freeTree( Node *treeData, int *treeIndex ) { *treeIndex = 0 ; free( treeData ) ; } /* --- randomized quicksort --------------------------- */ /* randomized because i suspect people will use this routine without thinking, giving data already sorted, and then be surprised at the slow response. */ /* probably not the fastest implementation in the world (especially with randomisation), but (a) i don't like pointer arithmetic and (b) the ks calculation is (by a constant factor) slower than this anyway. translated to c from cormen et al. */ void quickSort( MuacRecord *in, int nPoints ) { #ifdef DEBUG int i ; #endif ranSort( in, 0, nPoints - 1 ) ; #ifdef DEBUG for ( i = 0 ; i < nPoints ; ++i ) { fprintf( stderr, "%d %f\n", i, in[i].y ) ; } #endif } /* tail recursion applied - would gcc have optimised this if the while was an if and the last statement called ranSort with the right hand half of the data? */ void ranSort( MuacRecord *in, int start, int end ) { int pivot ; while ( start < end ) { pivot = ranPartition( in, start, end ) ; ranSort( in, start, pivot ) ; start = pivot + 1 ; } } /* this could be improved by using median of 3 (see cormen et al.) */ int ranPartition( MuacRecord *in, int start, int end ) { float y ; int left, right ; int r ; r = iRandom( start, end ) ; y = in[r].y ; left = start - 1 ; right = end + 1 ; for ( ; ; ) { do { --right ; } while( in[right].y > y ) ; do { ++left ; } while( in[left].y < y ) ; if ( left < right ) { exchange( in, left, right ) ; } else { return right ; } } } void exchange( MuacRecord *in, int a, int b ) { float xy ; char type ; xy = in[a].x ; in[a].x = in[b].x ; in[b].x = xy ; xy = in[a].y ; in[a].y = in[b].y ; in[b].y = xy ; type = in[a].type ; in[a].type = in[b].type ; in[b].type = type ; } int iRandom( int left, int right ) { return left + (int) ( ( 1.0 + right - left ) * rand( ) / ( RAND_MAX + 1.0 ) ) ; } /* --- print tree for debugging ----------------------- */ void printTree( Node *root, FILE *out ) { fprintf( out, "\n" ) ; printNode( root, out, 0 ) ; } void printNode( Node* node, FILE *out, int depth ) { fprintf( out, "%sscr: %f min %f, max %f, dlt %f\n", padding(depth), node->score, node->min, node->max, node->delta ) ; if ( node->left ) { printNode( node->left, out, depth + 1 ) ; } if ( node->right ) { printNode( node->right, out, depth + 1 ) ; } fflush( out ) ; } char* space = " " ; char* padding( int depth ) { return ( space + strlen( space ) - depth ) ; }