#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

#include "web.h"

/*
 * Look up which attribute of an HTML tag carries its cross reference.
 *
 * tag: NUL-terminated tag name; the comparison is exact and
 *      case-sensitive against upper-case names ("A", "IMG", ...).
 *
 * Returns the upper-case attribute name (a string with static storage,
 * not to be freed) for the tags listed in the table below, or NULL when
 * the tag is not one of them.
 */
const char * getxreffield(const char * tag)
{
    static const struct {
        const char *tagname;	/* tag to recognise */
        const char *attribute;	/* attribute holding the reference */
    } known[] = {
        { "A",     "HREF"       },
        { "IMG",   "SRC"        },
        { "FRAME", "SRC"        },
        { "BODY",  "BACKGROUND" },
        { "LINK",  "HREF"       },
        { "LAYER", "SRC"        },
    };
    size_t i;

    for (i = 0; i < sizeof known / sizeof known[0]; i++) {
        if (strcmp(tag, known[i].tagname) == 0) {
            return known[i].attribute;
        }
    }

    return NULL;
}

/*
 * Cursor helpers.  Both macros take no arguments and expand to code that
 * refers to variables named `p` (a pointer) and `index` (an integer)
 * which must be in scope wherever the macro is used; they keep the two
 * advancing in lock step.
 *
 * getpnext evaluates to the value of p before the increment (the
 * `index++ * 0` term adds nothing to the pointer and exists only for
 * its side effect), so `*getpnext` reads the current character and
 * steps past it.
 */
#define getpnext (p++ + (index++ * 0)) /* return p, postincrement index & p */
/*
 * nextp advances p and index by one.  It is an unparenthesized comma
 * expression, so use it only as a complete statement (`nextp;`), not
 * inside a larger expression.
 */
#define nextp p++, index++

/*
 * getxref: collect the cross references (URLs) found in an HTML stream.
 *
 * The whole stream is read into memory and scanned one character at a
 * time by a small state machine.  Comments (<!-- ... -->) are skipped.
 * Inside a tag whose upper-cased name is known to getxreffield(), the
 * scanner looks for the matching attribute name after whitespace, skips
 * the following whitespace, '=' and '"' characters, and takes what
 * follows as the reference.  The reference ends at whitespace, '"' or
 * '>' outside quotes, or at a '#' anywhere.  At most one reference is
 * taken per tag, and empty references are not recorded.
 *
 * fp:    stream to scan; it must be seekable, because its size is found
 *        with fseek()/ftell().  It is read from the start and is not
 *        closed here.
 * xrefs: result list.  nrefs is reset to 0, then for every reference:
 *          refs[i]      - strdup()'d copy of the reference text
 *          startloc[i]  - offset in the buffer of its first character
 *          endloc[i]    - offset of the character that terminated it
 *          alwaysget[i] - 1 if the enclosing tag is IMG, otherwise 0
 *
 * Returns 0 on success, including the case where scanning stops early
 * because 200 references were collected (a warning is printed).
 * Returns 1 if the stream size cannot be determined, the stream cannot
 * be read, or memory runs out; references stored before the failure
 * remain in xrefs.
 */
int getxref(FILE * fp, xreflist * xrefs)
{
    int incomment = 0;		/* 1 if in a comment */
    int intag = 0;		/* 1 if in a tag */
    int intagname = -1;		/* number of characters in if in tag name,
				   0 if before tag name, -1 otherwise */
    char tagnamebuf[80];	/* upper-cased name of the current tag */

    int lookxref = 0;		/* 1 if in a tag which contains xrefs */
    const char * xreffield = NULL; /* if lookxref, the string that starts
				   a reference */
    int matchedfield = 0;	/* characters of xreffield matched so far */

    long size;
    size_t nread;
    int inquotes = 0;
    int foundxref = 0;		/* 1 after the attribute name matched,
				   while skipping up to the value */
    int inxref = 0;		/* 1 while inside the reference text */
    int wspacecount = 0;	/* separators seen after the attribute name */

    char * buf;
    char * p, *xrefstart = NULL;
    char * copy;

    int index = 0, nxrefstart = 0;	/* index is always p - buf */

    xrefs -> nrefs = 0;

    /* ftell() returns -1 on failure (e.g. a pipe); never size a buffer
       from that. */
    if (fseek(fp, 0, SEEK_END) != 0 || (size = ftell(fp)) < 0) {
        fprintf(stderr, "cannot determine input size\n");
        return 1;
    }
    buf = malloc((size_t) size + 1);
    if (! buf) {
        fprintf(stderr, "malloc failed\n");
        return 1;
    }
    if (fseek(fp, 0, SEEK_SET) != 0) {
        free(buf);
        fprintf(stderr, "cannot rewind input\n");
        return 1;
    }
    nread = fread(buf, 1, (size_t) size, fp);
    if (ferror(fp)) {
        free(buf);
        fprintf(stderr, "read failed\n");
        return 1;
    }
    /* Terminate after what was actually read: a text-mode stream may
       deliver fewer bytes than ftell() reported. */
    buf[nread] = 0;

    p = buf;
    while (*p)
    {
        if (incomment) {
            if (!strncmp(p, "-->", 3)) {
                incomment = 0;
            }
            nextp;
            continue;
        }

        if (! intag) {
            if (!strncmp(p, "<!--", 4)) {
                incomment = 1;
                nextp;
                continue;
            }
            if (*p == '<') {
                /* new tag: reset all per-tag state */
                intag = 1;
                intagname = 0;
                inquotes = 0;
                lookxref = 0;
                foundxref = 0;
            }
            nextp;
            continue;	/* we aren't interested in stuff outside tags */
        }
        /* a newline also closes an open quote */
        if (*p == '"' || (inquotes && *p == '\n')) inquotes = !inquotes;
        if (inquotes && ! (inxref || foundxref)) {
            nextp;
            continue;	/* xref's can't start in quotes */
        }
        if (!inquotes && *p == '>') {
            intag = 0;
            if (! inxref)
            {
                nextp;
                continue;
            }
            /* otherwise fall through so the '>' terminates the xref */
        }

        if (intagname != -1) {
            if (iswhitespace(*p)) {
                if (intagname == 0) {
                    nextp;
                    continue; /* space at start of tag name */
                }
                tagnamebuf[intagname] = 0;
                intagname = -1;	/* not in tag name any more */

                xreffield = getxreffield(tagnamebuf);
                if (xreffield) {
                    lookxref = 1;
                    matchedfield = 0;
                }
                else
                    lookxref = 0;
                nextp;
                continue;
            }

            /* Drop characters that do not fit instead of overflowing
               tagnamebuf; a name that long matches no known tag anyway.
               The unsigned char cast keeps toupper() defined for bytes
               >= 0x80. */
            if ((size_t) intagname < sizeof tagnamebuf - 1)
                tagnamebuf[intagname++] = (char) toupper((unsigned char) *p);
            nextp;
            continue;
        }

        if (lookxref && ! inquotes) {
            int upper;

            if (! xreffield[matchedfield]) /* found field... start looking */
            {				   /* for ref */
                lookxref = 0;
                foundxref = 1;
                wspacecount = 0;
                continue;
            }

            if (matchedfield == 0 && !iswhitespace(p[-1])) {
                nextp;
                continue;	/* field may only start after whitespace */
            }

            /* consume the character whether or not it matches */
            upper = toupper((unsigned char) *p);
            nextp;
            if (xreffield[matchedfield] == upper)
                matchedfield++;
            else
                matchedfield = 0;
            continue;
        }

        if (foundxref)
        {
            if (iswhitespace(*p) || *p == '=' || *p == '"') {
                nextp;
                wspacecount++;
                continue;
            }
            else
            {
                if (wspacecount == 0) /* not the start of the text */
                {
                    /* the attribute name was only a prefix of a longer
                       word; go back to searching */
                    foundxref = 0;
                    lookxref = 1;
                    nextp;
                    continue;
                }
                xrefstart = p;
                nxrefstart = index;
                foundxref = 0;
                inxref = 1;
                continue;	/* don't increment p in case first char
				   is a #, in which case, the url is empty! */
            }
        }

        if (inxref)
        {
            if ((!inquotes && (iswhitespace(*p) || *p == '"' || *p == '>'))
                || *p == '#') /* # terminates URL even in strings! */
            {
                *p = 0; /* terminate string */
                if (p != xrefstart) {
                    copy = strdup(xrefstart);
                    if (! copy) {
                        free(buf);
                        fprintf(stderr, "malloc failed\n");
                        return 1;
                    }
                    xrefs->startloc[xrefs->nrefs] = nxrefstart;
                    xrefs->endloc[xrefs->nrefs] = index;
                    xrefs->alwaysget[xrefs->nrefs] = !strcmp(tagnamebuf,"IMG");
                    xrefs->refs[xrefs->nrefs++] = copy;

                    /* NOTE(review): 200 is presumably the capacity of
                       the arrays in xreflist (web.h) - confirm. */
                    if (xrefs->nrefs == 200) {
                        free(buf);
                        fprintf(stderr, "warning: xref buffer filled\n");
                        return 0;
                    }
                }
                nextp;
                inxref = 0;
                continue;
            }
        }

        nextp;
    }
    free(buf);
    return 0;
}


