#include #include #include #include #include #include #include typedef struct { void *buf; uint64_t size; uint64_t increment; } autospace; typedef struct { void *buf; uint64_t totalsize; uint64_t reserved0; uint64_t elemsize, nelem, maxelem; } autoarray; #define STROFFMASK 0xFFFFFFFFFFFFFFull typedef struct { uint8_t string_offset[7]; /* little-endian offset in stringstorage */ uint8_t tagnotvalue; /* 1 = tag key, 0 = value */ uint32_t nodecount, waycount, relcount; uint32_t tagaux_index; /* index in tagauxary of corresponding tag */ uint32_t next; /* index of next entry with same string */ } stringstat; typedef struct { uint64_t num_min, num_max; uint32_t total, strcount, numcount, boolcount; } tagaux; #define FILEBUFSZ 1048576 static char fbuf[FILEBUFSZ+1]; autospace stringstorage= { .buf= NULL, .size= 0, .increment= 8192 }; uint64_t stringstoreend= 0, stringcount= 0; uint32_t stringstorecount= 0, usedindices= 0, bucketmax= 0; #define STRINDEXSIZE (1<<26) uint32_t strbucketindex[STRINDEXSIZE]; autoarray stringstatary= { .buf= NULL, .elemsize= sizeof(stringstat), .maxelem= 1024, .nelem= 0 }; autoarray tagauxary= { .buf= NULL, .elemsize= sizeof(tagaux), .maxelem= 256, .nelem= 0 }; uint64_t readtotal= 0, movetotal= 0, minid= 0xFFFFFFFFFFFFFFFFull, maxid= 0; unsigned entrycount= 0; time_t starttime; int readentry(size_t *readoff, uint64_t *id); void skipquoted(char **c); uint64_t getnum(char **c); void newtag(int type, const char *key, const char *value); uint32_t hash(const char *str); void findstring(const char *str, int entrytype, int iskey, uint32_t *auxind); int compare_stringstat(const void *d1, const void *d2); void printstatus(int signal); void printstatistics(void); void autoalloc(autospace *sp); void autoextend(autospace *sp); void autoappend(autoarray *ary, const void *elem); void autoarysort(autoarray *ary, int (*compar)(const void *, const void *)); int main(int argc, char **argv) { uint64_t id; size_t readlen, readoff; int status, entrytype; if( argc > 1 && (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) ) { fprintf(stderr, "Usage: bzip2 -d -c foo.osm.bz2 | osmalltagstat | tee foo.tagstat\n"); return 1; } starttime= time(NULL); signal(SIGUSR1, printstatus); /* note: sigaction() has no effect */ readlen= fread(fbuf, 1, FILEBUFSZ, stdin); if( readlen == 0 ) { fprintf(stderr, "Could not read from stdin.\n"); return 2; } readoff= 0; readtotal= readlen; movetotal= 0; entrycount= 0; while( readlen ) { readlen += readoff; fbuf[readlen]= 0; readoff= 0; while(1) { entrytype= readentry(&readoff, &id); if( entrytype == -2 ) break; if( entrytype >= 0 ) { ++entrycount; if( id < minid ) minid= id; if( id > maxid ) maxid= id; } } if( readoff == 0 ) { if( readlen == FILEBUFSZ ) { fprintf(stderr, "Error: node too large for buffer, at offset %llu.\n", readtotal - readlen); // fprintf(stderr, "%s\n", fbuf); } break; } memmove(fbuf, fbuf+readoff, readlen-readoff); movetotal += readlen - readoff; readoff= readlen - readoff; readlen= fread(fbuf+readoff, 1, FILEBUFSZ-readoff, stdin); readtotal += readlen; // fprintf(stderr, "%lu bytes left, read %lu bytes\n", (unsigned long)readoff, (unsigned long)readlen); } printstatistics(); return 0; } /* <- -2 end of buffer, -1 none, 0 node, 1 way, 2 relation */ int readentry(size_t *readoff, uint64_t *id) { char *read= fbuf + *readoff; char *tagkey, *tagval, *keyend, *valend; int type; char tagattr, quote, keyendc, valendc; while( *read && *read != '<' ) ++read; if( ! *read ) return -2; *readoff= (size_t)(read - fbuf); ++read; if( !strncmp(read, "node ", 5) ) { type= 0; read += 4; } else if( !strncmp(read, "way ", 4) ) { type= 1; read += 3; } else if( !strncmp(read, "relation ", 9) ) { type= 2; read += 8; } else type= -1; while( *read && *read != '>' && (*read != ' ' || strncmp(read+1, "id=", 3)) && (*read != '/' || read[1] != '>') ) if( *read == '"' || *read == '\'' ) skipquoted(&read); else ++read; if( ! *read ) return -2; if( *read == ' ' ) { /* must be " id=" */ read += 4; *id= getnum(&read); while( *read && *read != '>' && (*read != '/' || read[1] != '>') ) if( *read == '"' || *read == '\'' ) skipquoted(&read); else ++read; } else *id= 0; if( *read == '/' ) { /* XML tag without content - no OSM tags */ read += 2; *readoff= (size_t)(read - fbuf); return type; } if( type < 0 ) { ++read; *readoff= (size_t)(read - fbuf); return type; } /* *read must now be '>', so we may have sub-tags such as */ while( 1 ) { while( *read && *read != '<' ) ++read; if( ! *read ) return -2; if( read[1] == '/' ) { /* etc. */ read += 2; while( *read && *read != '>' ) ++read; if( ! *read ) return -2; ++read; *readoff= (size_t)(read - fbuf); return type; } ++read; if( !strncmp(read, "tag ", 4) ) { read += 3; tagkey= tagval= NULL; keyendc= valendc= 0; while( *read && *read != '>' ) { if( *read == ' ' && (read[1] == 'k' || read[1] == 'v') && read[2] == '=' ) { tagattr= read[1]; read += 3; if( *read == '"' || *read == '\'' ) quote= *read++; else quote= 0; if( tagattr == 'k' ) tagkey= read; else tagval= read; if( quote ) { while( *read && *read != quote ) ++read; if( ! *read ) { if( keyendc ) *keyend= keyendc; return -2; } if( tagattr == 'k' ) { keyend= read; keyendc= quote; } else { valend= read; valendc= quote; } if( *read ) *read++ = 0; } else { while( *read && isalnum(*read) ) ++read; if( ! *read ) { if( keyendc ) *keyend= keyendc; return -2; } if( tagattr == 'k' ) { keyend= read; keyendc= *read; } else { valend= read; valendc= *read; } if( *read == '>' ) { *read++ = 0; break; } if( *read ) *read++ = 0; } } else if( *read == '"' || *read == '\'' ) skipquoted(&read); else ++read; } newtag(type, tagkey, tagval); if( keyendc ) *keyend= keyendc; if( valendc ) *valend= valendc; } while( *read && *read != '>' ) ++read; } } void skipquoted(char **c) { char *read= *c; char quote; if( *read != '"' && *read != '\'' ) { if( *read ) ++*c; return; } quote= *read; ++read; while( *read && *read != quote ) ++read; if( *read ) ++read; *c= read; } uint64_t getnum(char **c) { char *read= *c; uint64_t num; if( *read == '"' || *read == '\'' ) ++read; num= strtoull(read, (char**)&read, 10); if( *read == '"' || *read == '\'' ) ++read; *c= read; return num; } /* enter new tag into lists and determine value type */ void newtag(int type, const char *key, const char *value) { tagaux *tagauxbase; const char *numstr; uint64_t numval; uint32_t auxind; int isnew; if( ! key || ! value ) return; findstring(key, type, 1, &auxind); tagauxbase= (tagaux*)tagauxary.buf; ++tagauxbase[auxind].total; for( numstr= value; isspace(*numstr); ++numstr ); if( *numstr == '0' && (!numstr[1] || isspace(numstr[1])) || *numstr >= '1' && *numstr <= '9' ) { numval= strtoull(numstr, (char**)&numstr, 10); while( isspace(*numstr) ) ++numstr; if( *numstr == 0 ) { if( numval < tagauxbase[auxind].num_min ) tagauxbase[auxind].num_min= numval; if( numval > tagauxbase[auxind].num_max ) tagauxbase[auxind].num_max= numval; ++tagauxbase[auxind].numcount; return; } } if( !strcasecmp(value, "yes") || !strcasecmp(value, "no") ) ++tagauxbase[auxind].boolcount; else { ++tagauxbase[auxind].strcount; findstring(value, type, 0, &auxind); } } /* One-at-a-time hash by Bob Jenkins */ uint32_t hash(const char *str) { uint32_t h= 0; while( *str ) { h += *str++; h += (h << 10); h ^= (h >> 6); } h += (h << 3); h ^= (h >> 11); h += (h << 15); return h; } /* find or add string to stringstorage, stringstatary and tagauxary */ void findstring(const char *str, int entrytype, int iskey, uint32_t *auxind) { static int initdone= 0; stringstat newstr, *strstatbase; tagaux newaux; char *stringbase; uint64_t stroff; size_t slen; uint32_t h, strstatind, newcount; unsigned collisioncount; int neednewstrstat, newstring, samekind, samekey; if( ! initdone ) { memset(strbucketindex, 0xFF, 4*STRINDEXSIZE); initdone= 1; } ++stringcount; stringbase= (char*)stringstorage.buf; strstatbase= (stringstat*)stringstatary.buf; h= hash(str); strstatind= strbucketindex[h % STRINDEXSIZE]; collisioncount= 0; if( strstatind == 0xFFFFFFFF ) { strbucketindex[h % STRINDEXSIZE]= stringstatary.nelem; neednewstrstat= 1; newstring= 1; ++usedindices; } else { neednewstrstat= 0; newstring= 1; while( 1 ) { samekind= !!iskey == !!strstatbase[strstatind].tagnotvalue; samekey= iskey || strstatbase[strstatind].tagaux_index == *auxind; if( newstring ) { stroff= *(uint64_t*)strstatbase[strstatind].string_offset & STROFFMASK; newstring= strcmp(stringbase+stroff, str); if( ! newstring && samekind && samekey ) break; } else { if( stroff == (*(uint64_t*)strstatbase[strstatind].string_offset & STROFFMASK) && samekind && samekey ) break; } if( strstatbase[strstatind].next == 0xFFFFFFFF ) { strstatbase[strstatind].next= stringstatary.nelem; neednewstrstat= 1; break; } strstatind= strstatbase[strstatind].next; ++collisioncount; } } if( newstring ) { slen= strlen(str); while( stringstorage.size < stringstoreend + slen + 1 ) autoextend(&stringstorage); stringbase= (char*)stringstorage.buf; strncpy((char*)stringstorage.buf+stringstoreend, str, stringstorage.size-stringstoreend); stroff= stringstoreend; stringstoreend += slen + 1; ++stringstorecount; } if( neednewstrstat ) { if( iskey ) { newstr.tagaux_index= tagauxary.nelem; newaux.total= newaux.numcount= newaux.boolcount= newaux.strcount= 0; newaux.num_min= 0xFFFFFFFFFFFFFFFFull; newaux.num_max= 0; autoappend(&tagauxary, &newaux); } else newstr.tagaux_index= *auxind; *(uint64_t*)newstr.string_offset= stroff; newstr.tagnotvalue= iskey; newstr.nodecount= newstr.waycount= newstr.relcount= 0; if( entrytype == 0 ) newstr.nodecount= 1; else if( entrytype == 1 ) newstr.waycount= 1; else if( entrytype == 2 ) newstr.relcount= 1; newstr.next= 0xFFFFFFFF; strstatind= stringstatary.nelem; autoappend(&stringstatary, &newstr); strstatbase= (stringstat*)stringstatary.buf; if( ++collisioncount > bucketmax ) bucketmax= collisioncount; } else { if( entrytype == 0 ) ++strstatbase[strstatind].nodecount; else if( entrytype == 1 ) ++strstatbase[strstatind].waycount; else if( entrytype == 2 ) ++strstatbase[strstatind].relcount; } if( iskey ) *auxind= strstatbase[strstatind].tagaux_index; } /* Comparison function for sorting array of string statistics structs in order of declining occurrence of the tag (primarily) and its values. Tag keys sort before their values. */ int compare_stringstat(const void *d1, const void *d2) { stringstat *s1= (stringstat*)d1, *s2= (stringstat*)d2; tagaux *a1, *a2; int64_t valuetotaldiff; if( s1->tagaux_index != s2->tagaux_index ) { a1= (tagaux*)tagauxary.buf + s1->tagaux_index; a2= (tagaux*)tagauxary.buf + s2->tagaux_index; if( a1->total > a2->total ) return -1; if( a1->total < a2->total ) return 1; /* break ties to group by tag key */ if( a1 < a2 ) return -1; return a1 > a2; } if( s1->tagnotvalue ) return -1; if( s2->tagnotvalue ) return 1; valuetotaldiff= (int64_t)s2->nodecount + s2->waycount + s2->relcount - s1->nodecount - s1->waycount - s1->relcount; if( valuetotaldiff < 0 ) return -1; return valuetotaldiff > 0; } void printstatus(int signal) { FILE *out= signal ? stderr : stdout; fprintf(out, "Read %llu bytes in %u seconds, moved %llu within buffer\n", readtotal, (unsigned)(time(NULL) - starttime), movetotal); fprintf(out, "Found %u entries with IDs from %llu to %llu\n", entrycount, minid, maxid); fprintf(out, "Found %llu strings, %llu unique, %llu bytes\n", stringcount, stringstorecount, stringstoreend); fprintf(out, "%u of %u string indices, %u string stat structs, max bucket size %u\n", usedindices, STRINDEXSIZE, stringstatary.nelem, bucketmax); fprintf(out, "\n"); } void printstatistics(void) { stringstat *statbase= (stringstat*)stringstatary.buf; tagaux *auxbase= (tagaux*)tagauxary.buf; const char *stringbase= (const char *)stringstorage.buf; uint64_t ind; time_t t; uint32_t auxind= 0xFFFFFFFF; int iskey; printstatus(0); printstatus(1); fprintf(stderr, "Sorting arrays for output...\n"); t= time(NULL); autoarysort(&stringstatary, compare_stringstat); fprintf(stderr, "done. (%u seconds to sort)\n", (unsigned)(time(NULL) - t)); for( ind= 0; ind < stringstatary.nelem; ++ind ) { iskey= statbase[ind].tagnotvalue; printf("%s%s\t\t%u, %u, %u\n", iskey ? "" : " ", stringbase + (*(uint64_t*)statbase[ind].string_offset & STROFFMASK), statbase[ind].nodecount, statbase[ind].waycount, statbase[ind].relcount); if( iskey || statbase[ind].tagaux_index != auxind ) { if( ! iskey ) printf("?? unknown key ??\n"); // printf("aux ind changed from %u to %u\n", // auxind, statbase[ind].tagaux_index); auxind= statbase[ind].tagaux_index; if( auxbase[auxind].boolcount > 0 ) printf(" %u boolean values\n", auxbase[auxind].boolcount); if( auxbase[auxind].numcount ) printf(" %u numeric values from %llu to %llu\n", auxbase[auxind].numcount, auxbase[auxind].num_min, auxbase[auxind].num_max); if( auxbase[auxind].strcount > 0 ) printf(" %u string values:\n", auxbase[auxind].strcount); } } printf("Total runtime: %u seconds\n", (unsigned)(time(NULL) - starttime)); } /*! \brief Allocate buffer to be enlarged in the future Malloc()s a buffer with the size given by the size entry of its argument. \param sp Pointer to struct which contains the requested size and will receive the pointer to the buffer. */ void autoalloc(autospace *sp) { sp->buf= malloc(sp->size); } /*! \brief Enlarge buffer Calls realloc() to enlarge a buffer. The increment entry in its argument determines by how many bytes the buffer is extended. If it is zero, the size is doubled. If the reallocation fails, the size entry is left unchanged. WARNING: The address of the allocated space may change as a result of this function, rendering all pointers into it invalid. \param Pointer to struct containing the pointer to the old buffer, the current size and the increment. */ void autoextend(autospace *sp) { void *newbuf; uint64_t newsize; if( sp->increment ) newsize= sp->size + sp->increment; else newsize= sp->size * 2; newbuf= realloc(sp->buf, newsize); if( newbuf ) { sp->buf= newbuf; sp->size= newsize; } } /*! \brief Append element to array after reallocating as needed Creates or enlarges an array as needed before appending an element. On the first call, the buf pointer in the array struct must be NULL, but the elemsize and maxelem entries should be set to the element size and maximum number of elements to allocate space for. Subsequent calls append one element pointed to by the second argument. If necessary, the array is enlarged using autoextend() first. If the reallocation fails, the element cannot be appended. WARNING: The address of the space allocated for the array may change as a result of this function, rendering all pointers into it invalid. \param ary Pointer to array data struct containing pointer, element size, current and maximum allocated number of elements \param elem Pointer to new element to append */ void autoappend(autoarray *ary, const void *elem) { if( !ary->buf ) { if( !ary->elemsize ) ary->elemsize= sizeof(int); if( !ary->maxelem ) ary->maxelem= 8; ary->reserved0= 0; ary->totalsize= ary->elemsize * ary->maxelem; autoalloc((autospace*)ary); if( ary->buf ) { memcpy(ary->buf, elem, ary->elemsize); ary->nelem= 1; } else ary->nelem= 0; } else { if( ary->nelem >= ary->maxelem ) { autoextend((autospace*)ary); ary->maxelem= ary->totalsize / ary->elemsize; if( ary->nelem >= ary->maxelem ) /* realloc failed */ return; } memcpy((char*)ary->buf+ary->elemsize*ary->nelem, elem, ary->elemsize); ++ary->nelem; } } /* wrapper around qsort */ void autoarysort(autoarray *ary, int (*compar)(const void *, const void *)) { qsort(ary->buf, ary->nelem, ary->elemsize, compar); }