//
//  trees.c
//  Count classification/regression/decision support trees
//
//  Steven B. Gillispie
//  University of Washington, Department of Statistics
//  August 2017
//  Copyright (c) 2017 University of Washington. All rights reserved.
//

// NOTE(review): the header names on the #include lines were lost from this copy of the
// file. The three below are chosen from what the code calls (printf, calloc, sqrt/trunc);
// <limits.h> is added for LLONG_MAX in ReduceCounts. Confirm against the upstream file.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <limits.h>

#define N           20      /* Maximum number of leaves */
#define NUMPATHS    N       /* Number of different path lengths */
#define QUANTILES   20      /* Number of probability distribution histogram divisions */
//#define STDDEV            /* If defined use std dev for normVar, otherwise variance */

struct UnlabeledTree {                  // Data structure for an unlabeled tree
    long long   count;                  // Count of labeled versions of the tree
    short       pathLengths[NUMPATHS];  // Edge lengths of paths from root to leaves
    double      normVar;                // Normalized (by tree maximum) path lengths variation:
                                        //   std dev or variance according to STDDEV
};
typedef struct UnlabeledTree UnlabeledTree;

struct TreeSet {                        // Data structure for a list of unlabeled trees
    long long       numTrees;           // Count of unlabeled trees in the set
    long long       multiplier;         // GCD of tree counts
    UnlabeledTree*  treeList;           // List of unlabeled trees in the set
};
typedef struct TreeSet TreeSet;

long    gUnlTreesCounts[N+1];   // Number of possible unlabeled trees
TreeSet gTreeSet[N+1];          // Master set of tree sets of each n (0 and 1 are unused)
double  gQuantiles[QUANTILES];  // Histogram of the tree balanced-ness by variation

long long   Choose( long n, long k );
double      CalcTreeVariation( int n, UnlabeledTree* aTreePtr );
long long   ReduceCounts( TreeSet* aTreeSetPtr );


// Binomial coefficient C(n,k).
//
// Returns 0 when k is outside 0..n. The running product is divided at every step, which
// stays exact because after step i the value is C(n,i+1), itself an integer.
//
// NOTE(review): only "result = 1; for (i=0; i<..." survives of the original loop in this
// copy; the loop bound and body below are reconstructed from the function's stated
// purpose. Confirm against the upstream file.
long long Choose( long n, long k )
{
    long long   result;     // Returned value
    long        i;          // Index through factors

    if ( (k < 0) || (k > n) ) {
        return ( 0 );
    }

    result = 1;
    for (i=0; i<k; i++) {
        result = result * (n - i) / (i + 1);
    }

    return ( result );
}


// Compute the variation of the first n path lengths of a tree, store it in the tree's
// normVar field, and return it.
//
// The value is the population variance (mean of squares minus square of the mean), or its
// square root when STDDEV is defined. n must not exceed NUMPATHS, the size of pathLengths.
// For n <= 0 the variation is set to 0 instead of dividing by zero. A variance that comes
// out slightly negative from rounding is clamped to 0 so sqrt() cannot return NaN.
//
// NOTE(review): the function header, local declarations and loop header are missing from
// this copy; they are reconstructed from the prototype and the surviving statements.
// Confirm against the upstream file.
double CalcTreeVariation( int n, UnlabeledTree* aTreePtr )
{
    long    sumPathLengths = 0;         // Sum of the path lengths
    long    sumPathLengthsSqrd = 0;     // Sum of the squared path lengths
    double  mean;                       // Mean path length
    double  variance;                   // Variance of the path lengths
    int     k;                          // Index through paths

    if ( n <= 0 ) {
        aTreePtr->normVar = 0.0;
        return ( aTreePtr->normVar );
    }

    for (k=0; k<n; k++) {
        sumPathLengths += aTreePtr->pathLengths[k];
        sumPathLengthsSqrd += aTreePtr->pathLengths[k] * aTreePtr->pathLengths[k];
    }
    mean = 1.0 * sumPathLengths / n;

    variance = 1.0 * sumPathLengthsSqrd / n - mean * mean;
    if ( variance < 0.0 ) {
        variance = 0.0;
    }

#ifdef STDDEV
    aTreePtr->normVar = sqrt( variance );
#else
    aTreePtr->normVar = variance;
#endif

    return ( aTreePtr->normVar );
}


// Absolute value of a count as an unsigned number (well defined even for LLONG_MIN)
static unsigned long long CountMagnitude( long long value )
{
    return ( (value < 0) ? (0ULL - (unsigned long long)value) : (unsigned long long)value );
}


// Greatest common divisor by Euclid's algorithm; GreatestCommonDivisor(0, x) is x
static unsigned long long GreatestCommonDivisor( unsigned long long a, unsigned long long b )
{
    unsigned long long  remainder;

    while ( b != 0 ) {
        remainder = a % b;
        a = b;
        b = remainder;
    }

    return ( a );
}


// Divide the counts in the tree set by their GCD and return that GCD.
//
// The GCD is computed directly rather than assumed to be the smallest count, so the
// returned value always matches the division that was actually applied to the counts.
// Returns 1 and leaves the counts untouched when there is nothing to reduce: a NULL set,
// a NULL tree list, no trees, or all counts zero.
long long ReduceCounts( TreeSet* aTreeSetPtr )
{
    unsigned long long  gcd;    // Running GCD of the tree counts
    long long           i;      // Index through trees

    if ( !aTreeSetPtr || !aTreeSetPtr->treeList || (aTreeSetPtr->numTrees <= 0) ) {
        return ( 1 );
    }

    // Fold every count into the running GCD; once it reaches 1 it cannot change
    gcd = 0;
    for (i=0; i<aTreeSetPtr->numTrees; i++) {
        gcd = GreatestCommonDivisor( gcd, CountMagnitude( aTreeSetPtr->treeList[i].count ) );
        if ( gcd == 1 ) {
            break;
        }
    }

    // All counts zero, or a GCD that does not fit the return type: nothing to divide by
    if ( (gcd == 0) || (gcd > (unsigned long long)LLONG_MAX) ) {
        return ( 1 );
    }

    // Divide the tree counts by the GCD
    if ( gcd > 1 ) {
        for (i=0; i<aTreeSetPtr->numTrees; i++) {
            aTreeSetPtr->treeList[i].count /= (long long)gcd;
        }
    }

    // Return the GCD
    return ( (long long)gcd );
}


// NOTE(review): main() starts on the same collapsed source line as the code above and
// continues on the collapsed lines that follow. Its end is not visible and parts of its
// interior are lost, so it is not reformatted or repaired here. Its opening text is kept
// byte-for-byte on the single line below, still behind a `//` marker as in the original.
// int main(int argc, const char * argv[]) { int n; // Index through leaf counts int s; // Index through left tree branch size UnlabeledTree* curTreePtr; // Pointer to current tree UnlabeledTree* leftTreePtr; // Pointer to a previous tree being added as a left subtree UnlabeledTree* rightTreePtr; // Pointer to a previous tree being added as a right subtree int i, j, k; // Generic index variables long long numberings; // Increase in labels at this step // Calculate the possible numbers of unlabeled trees gUnlTreesCounts[0] = 0; gUnlTreesCounts[1] = 1; for (n=2; n<=N; n++) { gUnlTreesCounts[n] = 0; for (i=1; i<=(n/2); i++) { gUnlTreesCounts[n] += gUnlTreesCounts[i] * gUnlTreesCounts[n-i]; } printf( "Unlabeled tree count for n = %d is %ld\n", n, gUnlTreesCounts[n] ); } // Initialize gTreeSet with its starting 2-tree for (n=0; n<=N; n++) { gTreeSet[n].numTrees = 0; gTreeSet[n].multiplier = 1;
gTreeSet[n].treeList = NULL; } gTreeSet[2].numTrees = 1; gTreeSet[2].multiplier = 1; gTreeSet[2].treeList = ( UnlabeledTree* )calloc( 1, sizeof( UnlabeledTree ) ); if ( ! gTreeSet[2].treeList ) { printf( "Couldn't allocate memory for n-tree %d (need %d tree)\n", 2, 1 ); } gTreeSet[2].treeList[0].count = 1; gTreeSet[2].treeList[0].pathLengths[0] = 1; gTreeSet[2].treeList[0].pathLengths[1] = 1; gTreeSet[2].treeList[0].normVar = 0.0; // Recursively calculate the rest of the trees for (n=3; n<=N; n++) { // Allocate memory to hold the tree gTreeSet[n].treeList = ( UnlabeledTree* )calloc( gUnlTreesCounts[n], sizeof( UnlabeledTree ) ); if ( ! gTreeSet[n].treeList ) { printf( "Couldn't allocate memory for n-tree %d (need %ld trees)\n", n, gUnlTreesCounts[n] ); } gTreeSet[n].numTrees = 0; // Do the single added leaf for (j=0; j<(gTreeSet[n-1].numTrees); j++) { if ( gTreeSet[n].numTrees >= gUnlTreesCounts[n] ) { printf( "Number of trees exceeds storage space!\n" ); } else { curTreePtr = &(gTreeSet[n].treeList[gTreeSet[n].numTrees]); rightTreePtr = &(gTreeSet[n-1].treeList[j]); curTreePtr->count = n * gTreeSet[n-1].multiplier * rightTreePtr->count; curTreePtr->pathLengths[0] = 1; for (k=1; kpathLengths[k] = rightTreePtr->pathLengths[k-1] + 1; } CalcTreeVariation( n, curTreePtr ); gTreeSet[n].numTrees++; } } // Do the more complicated added leaves curTreePtr = &(gTreeSet[n].treeList[gTreeSet[n].numTrees]); for (s=2; s<(n/2 + 1); s++) { numberings = Choose( n, s ); if ( 2*s == n ) { numberings /= 2; // Account for symmetric duplication } // Run through the possible left subtrees for (i=0; i<(gTreeSet[s].numTrees); i++) { leftTreePtr = &(gTreeSet[s].treeList[i]); // Run through the possible right subtrees for (j=0; j<(gTreeSet[n-s].numTrees); j++) { if ( gTreeSet[n].numTrees >= gUnlTreesCounts[n] ) { printf( "Number of trees exceeds storage space!\n" ); } else { rightTreePtr = &(gTreeSet[n-s].treeList[j]); // Calculate the new count curTreePtr->count = numberings * 
gTreeSet[s].multiplier * gTreeSet[n-s].multiplier * leftTreePtr->count * rightTreePtr->count; if ( curTreePtr->count < 0 ) { printf( "Integer overflow: n = %d, tree = %lld:\n", n, gTreeSet[n].numTrees ); printf( "\tnumberings C(%d,%d) = %lld\n", n, s, numberings ); printf( "\tleft multiplier = %lld\n", gTreeSet[s].multiplier ); printf( "\tright multiplier = %lld\n", gTreeSet[n-s].multiplier ); printf( "\tleft count = %lld\n", leftTreePtr->count ); printf( "\tright count = %lld\n", rightTreePtr->count ); } // Add the new path lengths for (k=0; kpathLengths[k] = leftTreePtr->pathLengths[k] + 1; } for (k=s; kpathLengths[k] = rightTreePtr->pathLengths[k-s] + 1; } CalcTreeVariation( n, curTreePtr ); gTreeSet[n].numTrees++; curTreePtr = &(gTreeSet[n].treeList[gTreeSet[n].numTrees]); } } } } // Normalize the variation for the gTreeSet[n] set of trees { double maxVar; // Maximum tree set variation // Find the maximum variation maxVar = 0.0; for (i=0; inormVar > maxVar ) { maxVar = curTreePtr->normVar; } } if ( maxVar > 0.0 ) { // Normalize the tree variations for (i=0; icount ); printf( "%d", curTreePtr->pathLengths[0] ); for (k=1; kpathLengths[k] ); } #ifdef STDDEV printf( ") Std dev = %f\n", curTreePtr->normVar ); #else printf( ") Variance = %f\n", curTreePtr->normVar ); #endif } } */ // Output the quantiles for the n tree sets { long totalCounts; // Sum of the (reduced) counts in the tree set double curVar; // Variation of the current tree double intVar; // Integer portion of the rescaled variation int index; // Quantile index printf( "\n\n" ); for (n=2; n<=N; n++) { printf( "n = %d:\n", n ); // Get the total counts for all the trees in the tree set totalCounts = 0; for (i=0; i<(gTreeSet[n].numTrees); i++) { curTreePtr = &(gTreeSet[n].treeList[i]); totalCounts += curTreePtr->count; } // Fill in the histogram for (k=0; knormVar * QUANTILES; intVar = trunc( curVar ); if ( (intVar == curVar) && (intVar > 0.0) ) { index = ( int )intVar - 1; } else { index = ( int )intVar; } 
// Update the histogram gQuantiles[index] += (1.0 * curTreePtr->count / totalCounts); } // Print the histogram values for (k=0; k