/* inpaths.c -- track the paths of incoming news articles and prepare a
 *	      report in a format suitable for decwrl pathsurveys
 *
 *
 * This program inputs a list of filenames of news articles, and outputs a
 * data report which should be mailed to the decwrl Network Monitoring
 * Project at address "pathsurvey@decwrl.dec.com". Please run it once a month
 * if you can, in time so that the results arrive at decwrl by the 1st
 * day of the month.
 *
 *
 * Run it like this:
 *
 *  cd /usr/spool/news
 *  find . -type f -print | inpaths "yourhost" | mail pathsurvey@decwrl.dec.com
 *
 *  where "yourhost" is the host name of your computer, e.g. "decwrl".
 *
 * The input to "inpaths" must be a list of the file names of news articles,
 * relative to the spooling directory. "./news/config/2771" and
 * "news/config/2771" are both legal inputs, but "/usr/spool/news/config/2771"
 * is not.  If you have some other way of generating a list of news file
 * names, such as running a script over the history file, you can use that
 * instead. Inpaths handles crossposting regardless of which technique
 * you use.
 *
 * If you get an error message "no traffic found. Check $CWD", then the
 * problem is most likely that the path names you are giving it are not
 * relative to the spooling directory, e.g. you are feeding it lines like
 * "/usr/spool/news/news/config/2771" instead of "./news/config/2771"
 * 
 * There are 3 options: -s, -m, and -l for short, medium, and long report.
 * The default is to produce a long report. If you are worried about mail
 * expenses you can send a shorter report. The long report is typically
 * about 50K bytes for a major site, and perhaps 25K bytes for a smaller
 * site. 
 *
 * Brian Reid
 *	V1	 Sep 1986
 *	V2.4	 May 1989
 *	V2.4.1	 Jan 1992
 *
 * Special thanks to Mel Pleasant and Bob Thrush for significant help with
 * portability bugs.
 *     
 */

/* if you are compiling on a USG machine (SysV, etc),
   please uncomment the following line: */

/* #define SYSV			*/



#define VERSION "2.4"
#include <stdio.h>
#include <fcntl.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>

/* Number of bytes read from the start of each article when looking for
   its header fields. */
#define HEADBYTES 1024

/* time() is declared by hand, without a parameter list; its return type
   is long when SYSV is defined and time_t otherwise. */
#ifdef SYSV
    long time();
#else
    time_t time();
#endif

/* Hand-written declarations (no parameter lists) for the library routines
   used below; no header that would declare them is included above. */
extern int exit();
extern char *malloc();
extern char *strcpy();

/* this is index() or strchr() included here for portability */

/*
 * index -- return a pointer to the first occurrence of chr in the string
 * ptr, or NULL if chr does not occur.  The terminating NUL counts as part
 * of the string, so index(s, '\0') returns a pointer to the terminator.
 *
 * chr is converted to char before comparing, exactly as the old K&R
 * definition did.  The parameter types follow the POSIX declaration of
 * index() so that this definition stays compatible if a system header
 * also declares the function.
 */
char *index(const char *ptr, int chr)
 {
    const char target = (char) chr;

    for (;; ptr++) {
	if (*ptr == target) return (char *) ptr;
	if (*ptr == '\0') return (char *) NULL;
    }
 }

/*
 * main -- read a list of article file names from standard input, tally the
 * host-to-host links named in each article's "Path:" header, and write the
 * inhosts/inpaths survey report to standard output.
 *
 * usage: inpaths [-s] [-m] [-l] hostname
 *   -s, -m, -l select a short, medium or long report (long is the default);
 *   hostname is the name printed in the report's ZCZC marker lines.
 *
 * An input line is processed only if its last path component is all digits
 * and it names a regular file.  An article is counted only when the
 * directory part of its file name, with '/' turned into '.', equals the
 * first group in its "Newsgroups:" header, so that a crossposted article is
 * charged once.
 *
 * Returns 0 on success; returns 1 on a usage error, when no article was
 * counted, when memory is exhausted or when the report cannot be written.
 */
int main (int argc, char **argv)
 {
    char linebuf[1024];
    char ngfilename[sizeof linebuf];	/* never longer than the input line */
    char artbuf[HEADBYTES+1];		/* +1 for the NUL stored after read() */
    char *cp, *cp1, *cp2, *idcopy;
    char rightdelim;
    char *pathfield, *groupsfield;
    char *scanlimit;
    char *hostname;
    struct stat statbuf;
    static int passChar[256];
    int isopen, columns, verbose, totalTraffic, needHost;
    int len, slash, start, bucket;
    long nowtime, age, agesum;
    float avgAge;

	/* definitions for getopt */
    extern int optind;

 /* structure used to tally the traffic between two hosts */
    struct trec {
	struct trec *rlink;
	struct nrec *linkid;
	int tally;
    } ;

 /* structure to hold the information about a host */
    struct nrec {
	struct nrec *link;
	struct trec *rlink;
	char *id;
	long sentto; /* tally of articles sent to somebody from here */
    } ;
    struct nrec *hosthash[128], *hnptr, *list, *relay;
    struct trec *rlist;
    int i, article, gotbytes, c;

    verbose = 2;
    while ((c = getopt(argc, argv, "sml")) != -1) {
	switch (c) {
	    case 's': verbose = 0; break;
	    case 'm': verbose = 1; break;
	    case 'l': verbose = 2; break;
	    default:
		fprintf(stderr,
		"usage: %s [-s] [-m] [-l] hostname\n",argv[0]);
		return 1;
	}
    }
    if (optind >= argc) {
	fprintf(stderr,"usage: %s [-s] [-m] [-l] `hostname`\n",argv[0]);
	return 1;
    }
    hostname = argv[optind];

    if (isatty(fileno(stderr))) {
	fprintf(stderr,"computing %s inpaths for host %s\n",
	    verbose==0 ? "short" : (verbose==1 ? "medium" : "long"),hostname);
    }

    for (i = 0; i < 128; i++) hosthash[i] = (struct nrec *) NULL;

/* Precompute character types to speed up the scan.  Only ASCII letters,
   digits, '-', '.' and '_' are accepted in a host name; keeping the set
   within 0..127 also guarantees that the first character of a name is a
   valid index into hosthash[]. */
    for (i = 0; i <= 255; i++) {
	passChar[i] = (i < 128) &&
	    (isalpha(i) || isdigit(i) || i == '-' || i == '.' || i == '_');
    }
    totalTraffic = 0;
    nowtime = (long) time((time_t *) 0);
    agesum = 0;

    while (fgets(linebuf, sizeof linebuf, stdin) != (char *) NULL) {
	isopen = 0;

/* Strip the newline; a line too long for the buffer is discarded whole */
	len = (int) strlen(linebuf);
	if (len > 0 && linebuf[len-1] == '\n') {
	    linebuf[--len] = '\0';
	} else if (len == (int) sizeof linebuf - 1) {
	    while ((c = getchar()) != EOF && c != '\n')
		;
	    goto bypass;
	}

/* Skip blank lines */
	if (len == 0) goto bypass;

/* Skip files that do not have pure numeric names: the line must end in one
   or more digits which are either the whole line or directly follow a '/'.
   Afterwards "slash" is the index of that '/', or -1 if there is none. */
	slash = len - 1;
	while (slash >= 0 && isdigit((unsigned char) linebuf[slash])) slash--;
	if (slash == len - 1) goto bypass;
	if (slash >= 0 && linebuf[slash] != '/') goto bypass;

/* Open the file for reading (descriptor 0 is a valid result) */
	article = open(linebuf, O_RDONLY);
	isopen = (article >= 0);
	if (!isopen) goto bypass;
	if (fstat(article, &statbuf) != 0) goto bypass;

/* Reject names that are not ordinary files (code fix from Paul Eggert) */
#ifdef S_ISREG
	if (! S_ISREG(statbuf.st_mode)) goto bypass;
#else
	if ((statbuf.st_mode & S_IFMT) != S_IFREG) goto bypass;
#endif

/* Age of the file in hours.  It is added to agesum only once the article
   is actually counted, so that the average printed at the end is taken
   over the same articles as totalTraffic. */
	age = (nowtime - (long) statbuf.st_mtime) / 3600;

/* Pick the file name apart into an equivalent newsgroup name: drop a
   leading "." or "./", drop the final "/<digits>" component and turn every
   remaining '/' into '.' */
	start = 0;
	if (linebuf[0] == '.') {
	    start = 1;
	    if (linebuf[1] == '/') start = 2;
	}
	cp = ngfilename;
	for (i = start; i < slash; i++) {
	    *cp++ = (linebuf[i] == '/') ? '.' : linebuf[i];
	}
	*cp = '\0';

/* Read in the first few bytes of the article and NUL-terminate them so
   that the string scans below cannot run off the end of the buffer */
	gotbytes = read(article, artbuf, HEADBYTES);
	if (gotbytes < 10) goto bypass;
	artbuf[gotbytes] = '\0';

/* Find the "Path:" and "Newsgroups:" header fields.  cp is at the start of
   a line at the top of each pass; an empty line ends the header. */
	pathfield = (char *) NULL;
	groupsfield = (char *) NULL;
	scanlimit = &artbuf[gotbytes];
	for (cp = artbuf; cp < scanlimit; cp++) {
	    if (*cp == '\n') break;
	    if (strncmp(cp, "Path: ", 6) == 0) {
		pathfield = cp;
	    } else if (strncmp(cp, "Newsgroups: ", 12) == 0) {
		groupsfield = cp;
	    }
	    if (pathfield != NULL && groupsfield != NULL) break;
	    while (cp < scanlimit && *cp != '\n') cp++;
	}
	if (groupsfield == NULL || pathfield == NULL) goto bypass;

/* Determine the name of the newsgroup to which this is charged. It is not
   necessarily the name of the file in which we found it; rather, use the
   first group of the "Newsgroups:" field.				 */
	groupsfield += 12;	/* skip 'Newsgroups: ' */
	while (*groupsfield == ' ') groupsfield++;
	cp = (char *) index(groupsfield,'\n');
 /* if this field is malformed (its end is not in the buffer), there is no
    point trying to process the entire message. */
	if (cp == NULL) goto bypass;
	*cp = '\0';
	cp = (char *) index(groupsfield,',');
	if (cp != NULL) *cp = '\0';	/* crossposted: keep the first group */

/* To avoid double-billing, only charge the newsgroup if the pathname matches
   the contents of the Newsgroups: field. This will also prevent picking up
   junk and control messages.
 */
	if (strcmp(ngfilename,groupsfield) != 0) goto bypass;

/* Find the end of the "Path:" line; give up if it is not in the buffer */
	cp = pathfield;
	while (*cp != '\0' && *cp != '\n') cp++;
	if (*cp == '\0') {
	    fprintf(stderr,"%s: end of Path line not in buffer.\n",linebuf);
	    goto bypass;
	}

	totalTraffic++;
	agesum += age;

/* Extract all of the host names from the "Path:" field and put them in our
   host table. */
	*cp = '\0';
	pathfield += 5;	/* skip 'Path:' */
	cp1 = pathfield;
	relay = (struct nrec *) NULL;
	hnptr = (struct nrec *) NULL;	/* no host seen yet in this article */
	while (cp1 < cp) {
	    /* cp1 rests on the previous delimiter: the blank after "Path:"
	       on the first pass, afterwards a delimiter that has already
	       been overwritten with NUL.  Step over it and collect the run
	       of host-name characters that follows. */
	    cp2 = ++cp1;
	    while (passChar[(unsigned char) *cp2]) cp2++;

	    rightdelim = *cp2; *cp2 = '\0';
	    if (rightdelim == '!' && *cp1 != '\0') {
		/* the table is hashed on the first character of the name */
		bucket = (unsigned char) *cp1;

		/* see if already in the table; comparing the second
		   character first avoids most of the strcmp() calls */
		for (list = hosthash[bucket]; list != NULL; list = list->link) {
		    if (list->id[1] == cp1[1] && strcmp(list->id, cp1) == 0)
			break;
		}
		if (list != NULL) {
		    hnptr = list;
		} else {
		    /* get storage and splice in a new one.  The casts are
		       kept because this file declares malloc() itself, as
		       returning char *. */
		    hnptr = (struct nrec *) malloc(sizeof *hnptr);
		    idcopy = (char *) malloc((size_t) strlen(cp1) + 1);
		    if (hnptr == NULL || idcopy == NULL) {
			fprintf(stderr,"%s: out of memory\n",argv[0]);
			return 1;
		    }
		    strcpy(idcopy, cp1);
		    hnptr->id = idcopy;
		    hnptr->link = hosthash[bucket];
		    hnptr->rlink = (struct trec *) NULL;
		    hnptr->sentto = (long) 0;
		    hosthash[bucket] = hnptr;
		}
	    }
/*
At this point "hnptr" points to the host record of the most recent host
found on this Path: line (NULL if none yet). If there was a relay host, then
"relay" points to its host record (the relay host is just the previous host
on the Path: line). Since this Path means that news has flowed from host
"hnptr" to host "relay", we tally one message for that link: the tally
record is kept on the list hanging off "relay" and names "hnptr", and the
"sentto" counter of "hnptr" is incremented.
*/
	    if (relay != NULL && relay != hnptr) {
		for (rlist = relay->rlink; rlist != NULL; rlist = rlist->rlink) {
		    if (rlist->linkid == hnptr) break;
		}
		if (rlist == NULL) {
		    rlist = (struct trec *) malloc(sizeof *rlist);
		    if (rlist == NULL) {
			fprintf(stderr,"%s: out of memory\n",argv[0]);
			return 1;
		    }
		    rlist->rlink = relay->rlink;
		    relay->rlink = rlist;
		    rlist->linkid = hnptr;
		    rlist->tally = 0;
		}
		rlist->tally++;
		hnptr->sentto++;
	    }

	    cp1 = cp2;
	    relay = hnptr;
	    if (rightdelim == ' ' || rightdelim == '(') break;
	}
bypass: if (isopen) close(article) ;
    }

/* Now dump the host table */
    if (!totalTraffic) {
	fprintf(stderr,"%s: error--no traffic found. Check $CWD.\n",argv[0]);
	return 1;
    }

    /* agesum is in hours; the report gives the average age in days */
    avgAge = ((double) agesum) / (24.0*(double) totalTraffic);
    printf("ZCZC begin inhosts %s %s %d %d %3.1f\n",
	VERSION,hostname,verbose,totalTraffic,avgAge);
    for (bucket = 0; bucket < 128; bucket++) {
	for (list = hosthash[bucket]; list != NULL; list = list->link) {
	    if (list->rlink == NULL) continue;
	    if (verbose > 0 || (100*list->sentto > totalTraffic))
		printf("%ld\t%s\n",list->sentto, list->id);
	}
    }
    printf("ZCZC end inhosts %s\n",hostname);

    printf("ZCZC begin inpaths %s %s %d %d %3.1f\n",
	VERSION,hostname,verbose,totalTraffic,avgAge);
    for (bucket = 0; bucket < 128; bucket++) {
	for (list = hosthash[bucket]; list != NULL; list = list->link) {
	    if (list->rlink == NULL) continue;
	    if (!(verbose > 1 || (100*list->sentto > totalTraffic))) continue;

	    /* The "<host> H " prefix is printed straight from the host
	       record, only once a link worth reporting has been found,
	       and again whenever the output line is wrapped. */
	    columns = 3 + (int) strlen(list->id);
	    needHost = 1;
	    for (rlist = list->rlink; rlist != NULL; rlist = rlist->rlink) {
		if (
		     (100L*rlist->tally > totalTraffic)
		  || ((verbose > 1)&&(5000L*rlist->tally > totalTraffic))
		   ) {
		    if (needHost) printf("%s H ",list->id);
		    needHost = 0;
		    relay = rlist->linkid;
		    if (columns > 70) {
			printf("\n%s H ",list->id);
			columns = 3 + (int) strlen(list->id);
		    }
		    printf("%d Z %s U ", rlist->tally, relay->id);
		    columns += 9 + (int) strlen(relay->id);
		}
	    }
	    if (!needHost) printf("\n");
	}
    }
    printf("ZCZC end inpaths %s\n",hostname);

    /* fclose() flushes the report; a failure here means it was not
       completely written */
    if (fclose(stdout) != 0) {
	fprintf(stderr,"%s: error writing report\n",argv[0]);
	return 1;
    }
    return 0;
}
