epa/tr-stat.cc

Go to the documentation of this file.
00001 // Generate statistics from EPA traces 
00002 //   http://ita.ee.lbl.gov/html/contrib/EPA-HTTP.html
00003 //
00004 // All we need to know: 
00005 // 
00006 // (1) client request streams: 
00007 //     <time> <clientID> <serverID> <URL_ID> 
00008 // (2) server page mod stream(s):
00009 //     <serverID> <URL_ID> <PageSize>
00010 //
00011 // Part of the code comes from Steven Gribble's UCB trace parse codes
00012 // 
00013 // $Header: /nfs/jade/vint/CVSROOT/ns-2/indep-utils/webtrace-conv/epa/tr-stat.cc,v 1.3 2005/09/18 23:33:32 tomh Exp $
00014 
00015 #include <stdio.h>
00016 #include <stdlib.h>
00017 #include <string.h>
00018 #include <ctype.h>
00019 #include <time.h>
00020 #include <sys/types.h>
00021 #include <sys/socket.h>
00022 #include <netinet/in.h>
00023 #include <arpa/inet.h>
00024 #include <tcl.h>
00025 
// Per-URL bookkeeping record. One instance is stored in urlHash for every
// distinct URL seen in the trace.
struct URL {
    // A record is created when its URL is seen for the first time, so the
    // access count starts at 1 rather than 0.
    URL(int i, int sd, int sz)
        : access(1),
          id(i),
          sid(sd),
          size(sz)
    {
    }

    int access;  // number of requests seen for this URL
    int id;      // sequence number given to the URL when it was first seen
    int sid;     // id of the server the page belongs to
    int size;    // page size taken from the log entry that created the record
};
00032 
// One accepted client request, buffered in memory until the whole trace
// has been read and can be sorted by time.
struct ReqLog {
    // Deliberately leaves every field uninitialised; this is the
    // constructor used when the rlog array is allocated.
    ReqLog() {}

    ReqLog(double t, unsigned int c, unsigned int s, unsigned int u)
        : time(t), cid(c), sid(s), url(u)
    {
    }

    double time;       // request time
    unsigned int cid;  // client sequence number
    unsigned int sid;  // server id
    unsigned int url;  // URL sequence number (matches URL::id)
};
00040 
00041 FILE *cf, *sf;
00042 double initTime = -1;
00043 double duration = -1;
00044 double startTime = -1;
00045 
00046 Tcl_HashTable cidHash;  // Client id (IP, port) hash
00047 static int client = 0;  // client sequence number
00048 
00049 static int server = 1;
00050 
00051 Tcl_HashTable urlHash;  // URL id hash
00052 static int url = 0; // URL sequence number
00053 static int* umap;   // URL mapping table, used for url sort
00054 
00055 ReqLog* rlog = NULL;
00056 unsigned int num_rlog = 0, sz_rlog = 0;
00057 
// Fields extracted from one line of the input trace by get_next_entry().
struct Entry {
    char *client;       // client field of the log line
    unsigned int time;  // timestamp folded into seconds
    char *url;          // requested URL
    int size;           // size field of the log line
};
00064 
00065 static int compare(const void *a1, const void *b1)
00066 {
00067     const ReqLog *a, *b;
00068     a = (const ReqLog*)a1, b = (const ReqLog*)b1;
00069     return (a->time > b->time) ? 1 : 
00070         (a->time == b->time) ? 0 : -1;
00071 }
00072 
00073 void sort_rlog()
00074 {
00075     qsort((void *)rlog, num_rlog, sizeof(ReqLog), compare);
00076     double t = rlog[0].time;
00077     for (unsigned int i = 0; i < num_rlog; i++) {
00078         rlog[i].time -= t;
00079         fprintf(cf, "%f %d %d %d\n", rlog[i].time, 
00080             rlog[i].cid, rlog[i].sid, umap[rlog[i].url]);
00081     }
00082     // Record trace duration and # of unique urls
00083     fprintf(cf, "i %f %u\n", rlog[num_rlog-1].time, url);
00084 
00085     fprintf(stderr, 
00086         "%d unique clients, %d unique servers, %d unique urls.\n", 
00087         client, server, url);
00088 }
00089 
00090 static int compare_url(const void* a1, const void* b1)
00091 {
00092     const URL **a, **b;
00093     a = (const URL**)a1, b = (const URL**)b1;
00094     return ((*a)->access > (*b)->access) ? -1:
00095         ((*a)->access == (*b)->access) ? 0 : 1;
00096 }
00097 
00098 void sort_url()
00099 {
00100     // XXX use an interval member of Tcl_HashTable
00101     URL** tbl = new URL*[urlHash.numEntries];
00102     Tcl_HashEntry *he;
00103     Tcl_HashSearch hs;
00104     int i = 0, sz = urlHash.numEntries;
00105     for (he = Tcl_FirstHashEntry(&urlHash, &hs);
00106          he != NULL;
00107          he = Tcl_NextHashEntry(&hs))
00108         tbl[i++] = (URL*)Tcl_GetHashValue(he);
00109     Tcl_DeleteHashTable(&urlHash);
00110 
00111     // sort using access frequencies
00112     qsort((void *)tbl, sz, sizeof(URL*), compare_url);
00113     umap = new int[url];
00114     // write sorted url to page table
00115     for (i = 0; i < sz; i++) {
00116         umap[tbl[i]->id] = i;
00117         fprintf(sf, "%d %d %d %u\n", tbl[i]->sid, i,
00118             tbl[i]->size, tbl[i]->access);
00119         delete tbl[i];
00120     }
00121     delete []tbl;
00122 }
00123 
00124 double lf_analyze(Entry& lfe)
00125 {
00126     double time;
00127     int ne, cid, sid, uid;
00128     Tcl_HashEntry *he;
00129 
00130     time = (double)lfe.time;
00131 
00132     if (initTime < 0) {
00133         initTime = time;
00134         time = 0;
00135     } else 
00136         time -= initTime;
00137 
00138     // If a trace start time is required, don't do anything
00139     if ((startTime > 0) && (time < startTime)) 
00140         return -1;
00141 
00142     // If page size is 0, ignore it
00143     if (lfe.size == 0) 
00144         return -1;
00145 
00146     // check client id
00147     if (!(he = Tcl_FindHashEntry(&cidHash, (const char *)lfe.client))) {
00148         // new client, allocate a client id
00149         he = Tcl_CreateHashEntry(&cidHash, (const char *)lfe.client, &ne);
00150         client++;
00151         long clientValue = client;
00152         Tcl_SetHashValue(he, clientValue);
00153         cid = client;
00154     } else {
00155         // existing entry, find its client seqno
00156         cid = (long)Tcl_GetHashValue(he);
00157     }
00158 
00159     // only a single server for EPA trace
00160     sid = 0;
00161 
00162     // check url id
00163     if (!(he = Tcl_FindHashEntry(&urlHash, (const char*)lfe.url))) {
00164         // new client, allocate a client id
00165         he = Tcl_CreateHashEntry(&urlHash, (const char*)lfe.url, &ne);
00166         URL* u = new URL(++url, sid, lfe.size);
00167         Tcl_SetHashValue(he, (const char*)u);
00168         uid = u->id;
00169     } else {
00170         // existing entry, find its client seqno
00171         URL* u = (URL*)Tcl_GetHashValue(he);
00172         u->access++;
00173         uid = u->id;
00174     }
00175 
00176     rlog[num_rlog++] = ReqLog(time, cid, sid, uid);
00177     //fprintf(cf, "%f %d %d %d\n", time, cid, sid, uid);
00178 
00179     if (startTime > 0) 
00180         return time - startTime;
00181     else 
00182         return time;
00183 }
00184 
00185 int get_next_entry(Entry& lfe) 
00186 {
00187     char buf[1024];
00188 
00189     if (feof(stdin)) 
00190         return 0;
00191 
00192     fgets(buf, 1024, stdin);
00193     if (feof(stdin) || ferror(stdin))
00194         return 0;
00195 
00196     char *tmp = buf, *code, *method, *date;
00197     lfe.client = strtok(tmp, " ");
00198     date = strtok(NULL, " "); 
00199     method = strtok(NULL, " ");     // GET/POST
00200     *(method++) = 0;
00201     if (strcmp(method, "GET") != 0) 
00202         // Only take GET requests
00203         return -1;
00204 
00205     lfe.url = strtok(NULL, " "); 
00206     if (strchr(lfe.url, '?') != NULL) 
00207         // Do not take any url that contains '?'
00208         return -1;
00209     strtok(NULL, " ");      // HTTP/1.0
00210     code = strtok(NULL, " ");   // return code
00211     if ((atoi(code) != 200) && (atoi(code) != 304)) 
00212         return -1;
00213     // last element: size
00214     tmp = strtok(NULL, " ");
00215     lfe.size = atoi(tmp);
00216 
00217     // parse date
00218     // date is from internal string of strtok(), we have to copy it. 
00219     // What a stupid strtok()!!!!
00220     tmp = new char[strlen(date)+1];
00221     strcpy(tmp, date);  
00222     date = tmp + 1;
00223     lfe.time = 0;
00224     date = strtok(date, ":"); // day
00225     lfe.time = atoi(date);
00226     date = strtok(NULL, ":"); // hour
00227     lfe.time = lfe.time*24 + atoi(date);
00228     date = strtok(NULL, ":"); // minute
00229     lfe.time = lfe.time*60 + atoi(date);
00230     date = strtok(NULL, "]");
00231     lfe.time = lfe.time*60 + atoi(date);
00232     delete []tmp;
00233 
00234     return 1;
00235 }
00236 
00237 int main(int argc, char**argv)
00238 {
00239     Entry lfntree;
00240     int      ret;
00241     double   ctime;
00242 
00243     // Init tcl
00244     Tcl_Interp *interp = Tcl_CreateInterp();
00245     if (Tcl_Init(interp) == TCL_ERROR) {
00246         printf("%s\n", interp->result);
00247         abort();
00248     }
00249     Tcl_InitHashTable(&cidHash, TCL_STRING_KEYS);
00250     Tcl_InitHashTable(&urlHash, TCL_STRING_KEYS);
00251 
00252     if ((cf = fopen("reqlog", "w")) == NULL) {
00253         printf("cannot open request log.\n");
00254         exit(1);
00255     }
00256     if ((sf = fopen("pglog", "w")) == NULL) {
00257         printf("cannot open page log.\n");
00258         exit(1);
00259     }
00260 
00261     if ((argc > 4) || (argc < 2)) {
00262         printf("Usage: %s <trace size> [<time duration>] [<start_time>]\n", argv[0]);
00263         return 1;
00264     }
00265     if (argc >= 3) {
00266         duration = strtod(argv[2], NULL);
00267         if (argc == 4) {
00268             startTime = strtod(argv[3], NULL);
00269             printf("start time = %f\n", startTime);
00270         }
00271     }
00272 
00273     sz_rlog = strtoul(argv[1], NULL, 10);
00274     rlog = new ReqLog[sz_rlog];
00275 
00276     while((ret = get_next_entry(lfntree)) != 0) {
00277         // Analyse one log entry
00278         if (ret < 0)
00279             continue;
00280         ctime = lf_analyze(lfntree);
00281         if ((duration > 0) && (ctime > duration))
00282             break;
00283     }
00284     Tcl_DeleteHashTable(&cidHash);
00285 
00286     fprintf(stderr, "sort url\n");
00287     sort_url();
00288     fclose(sf);
00289 
00290     fprintf(stderr, "sort requests\n");
00291     sort_rlog();
00292     fclose(cf);
00293 
00294     fprintf(stderr, 
00295         "%d unique clients, %d unique servers, %d unique urls.\n", 
00296         client, server, url);
00297     return 0;
00298 }

Generated on Tue Mar 6 16:47:53 2007 for ns2 Network Simulator 2.29 by  doxygen 1.4.6