/* rbhtml.c
 *
 * These routines interface with libxml's HTML parser.  There is a data
 * pushing routine for sending data to the parser (as well as one to flush
 * the data at the end), but the majority of the file is dedicated to the
 * call-back functions that allow us to transform the SAX events into the
 * HTML-subset that the Rocket eBook can use (as well as noting all the
 * necessary information for creating the associated .hidx page and, if
 * appropriate, the associated .hkey page).
 */
/* This software is copyrighted as detailed in the LICENSE file. */

#include <config.h>
#include <stdarg.h>
#include <libxml/xmlerror.h>
#include <libxml/encoding.h>
#include <ctype.h>
#include <rbmake/rbmake.h>
#include "rbmake.h"
#include "rbpage.h"
#include "rbhtml.h"
#include "mbuf.h"
#include "tags.h"

#define RB_PARSE_CHUNK_SIZE (512*3)

/* Table of every HTML tag this file knows about.  Each entry holds the
 * upper-case tag text that gets written to the output, the TAG_* element
 * number used by the switch statements in rbStartElement() and
 * rbEndElement(), and a set of RB_* flags:
 *
 *   RB_IN_SUBSET         - the tag itself is written to the output.
 *   RB_NOTE_PARA         - RbPage_noteHidxPara() is called for the tag.
 *   RB_NOTE_TAG          - the tag is recorded in the tag tree (noteTag()).
 *   RB_TRIM_WS           - one trailing whitespace character in the output
 *                          is removed before the tag is written.
 *   RB_NO_CLOSE_TAG      - the closing tag is ignored by rbEndElement().
 *   RB_DISCARD_CONTAINER - the tag and everything inside it is discarded.
 *
 * RbHtml_init() stores a pointer to each entry in tagHash under the
 * lower-cased tag name.  The list ends with a NULL tag name. */
TagInfo tagInfo[] = {
 {"A",		TAG_A,		RB_IN_SUBSET },
 {"B",		TAG_B,		RB_IN_SUBSET		    | RB_NOTE_TAG },
 {"BIG",	TAG_BIG,	RB_IN_SUBSET		    | RB_NOTE_TAG },
 {"BLOCKQUOTE", TAG_BLOCKQUOTE, RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"BODY",	TAG_BODY,	RB_IN_SUBSET		    | RB_NOTE_TAG | RB_TRIM_WS },
 {"BR", 	TAG_BR,		RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG | RB_TRIM_WS | RB_NO_CLOSE_TAG },
 {"CENTER",	TAG_CENTER,	RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"CODE",	TAG_CODE,	RB_IN_SUBSET		    | RB_NOTE_TAG },
 {"DIV",	TAG_DIV,	RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"H1", 	TAG_H1,		RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"H2", 	TAG_H2,		RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"H3", 	TAG_H3,		RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"H4", 	TAG_H4,		RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"H5", 	TAG_H5,		RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"H6", 	TAG_H6,		RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"HEAD",	TAG_HEAD,	RB_IN_SUBSET },
 {"HR", 	TAG_HR,		RB_IN_SUBSET | RB_NOTE_PARA                            | RB_NO_CLOSE_TAG },
 {"HTML",	TAG_HTML,	RB_IN_SUBSET		    | RB_NOTE_TAG },
 {"I",		TAG_I,		RB_IN_SUBSET		    | RB_NOTE_TAG },
 {"IMG",	TAG_IMG,	RB_IN_SUBSET				  | RB_TRIM_WS | RB_NO_CLOSE_TAG },
 {"LI", 	TAG_LI,		RB_IN_SUBSET },
 {"META",	TAG_META,	RB_IN_SUBSET                                           | RB_NO_CLOSE_TAG },
 {"OL", 	TAG_OL,		RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"P",		TAG_P,		RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"PRE",	TAG_PRE,	RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },
 {"SMALL",	TAG_SMALL,	RB_IN_SUBSET		    | RB_NOTE_TAG },
 {"SUB",	TAG_SUB,	RB_IN_SUBSET },
 {"SUP",	TAG_SUP,	RB_IN_SUBSET },
 {"TITLE",	TAG_TITLE,	RB_IN_SUBSET },
 {"TT", 	TAG_TT,		RB_IN_SUBSET		    | RB_NOTE_TAG },
 {"UL", 	TAG_UL,		RB_IN_SUBSET | RB_NOTE_PARA | RB_NOTE_TAG },

 /* These tags are not supported, but we'll include them anyway */
 {"S",		TAG_S,		RB_IN_SUBSET },
 {"U",		TAG_U,		RB_IN_SUBSET },

 /* These tags are mapped into other tags (the replacement TagInfo is
  * picked in the switch statements of rbStartElement()/rbEndElement()) */
 {"FONT", 	TAG_FONT,	0 },
 {"DL", 	TAG_DL,		0 },
 {"DT", 	TAG_DT,		0 },
 {"DD", 	TAG_DD,		0 },
 {"EM", 	TAG_EM,		0 },
 {"STRONG",	TAG_STRONG,	0 },
 {"TABLE",	TAG_TABLE,	0 },
 {"TR", 	TAG_TR,		0 },
 {"TH", 	TAG_TH,		0 },
 {"TD", 	TAG_TD,		0 },

 /* These tags (and all their contents) are completely discarded */
 {"FRAMESET",	TAG_FRAMESET,	RB_DISCARD_CONTAINER },
 {"SCRIPT",	TAG_SCRIPT,	RB_DISCARD_CONTAINER },
 {"STYLE",	TAG_STYLE,	RB_DISCARD_CONTAINER },
 {"NOFRAMES",	TAG_NOFRAMES,	RB_DISCARD_CONTAINER },
 {"NOSCRIPT",	TAG_NOSCRIPT,	RB_DISCARD_CONTAINER },

 /* End-of-table marker */
 { NULL,	0,		0 }
};

/* Pairs of (META name, infoHash key), terminated by a single NULL name.
 * rbStartElement() compares the NAME attribute of a META tag against the
 * first string of each pair (ignoring case) and acts on the second:
 *   - a non-empty key: on the first page the CONTENT value is offered to
 *     rb->infoHash under that key; on later pages the META tag is dropped;
 *   - the empty string "": the tag gets the special "rocket-menu"
 *     processing (an item for the "Go To" menu);
 *   - NULL: the META tag is dropped entirely.
 * META names that are not listed here are passed through unchanged. */
static const char *metaNames[] = {
    "author", "AUTHOR",
    "genre", "GENRE",
    "isbn", "ISBN",
    "publisher", "PUB_NAME",
    "rocket-menu", "",
    "generator", NULL,
    NULL
};

/* Lookup table from lower-cased tag name to its tagInfo[] entry.  It is
 * built by RbHtml_init() and doubles as the "already initialized" flag. */
static HashTable *tagHash;
/* Scratch buffers shared by the routines in this file; all four are
 * created in RbHtml_init(). */
static MBuf *tmpBuf, *urlBuf, *tagBuf, *warnBuf;

/* Forward declarations of the file-local helpers. */
static bool UTF8ToMBuf(MBuf *mb, const char *f, int flen, char quoteChar,
		       bool urlEscape, RbPage *pg);
static bool checkDelayedChars(MBuf *mb, RbPage *pg, unsigned int u);
static TagTree *noteTag(RbPage *pg, int elnum, int align);
static void unbookify_paragraph(RbPage *pg);
static int bumpHidxNames(void *userPtr, const char *key, void *obj);
static void parseHKeyItem(RbPage *pg);
static char *urlEscapeChar(int ch);
static void removeAmpersandEntities(char *bp);
static void cacheXmlError(void *ctx, const char *fmt, ...);
static void vcacheXmlError(const char *fmt, va_list args);

void
RbHtml_init()
{
    if (!tagHash) {
	char *t, *f;
	int i;
	char buf[64];
	tagHash = HashTable_new(101, false, false);
	for (i = 0; tagInfo[i].tag; i++) {
	    for (t = buf, f = tagInfo[i].tag; *f; f++)
		*t++ = ISUPPER(*f)? TOLOWER(*f) : *f;
	    *t = '\0';
	    HashTable_store(tagHash, buf, (void*)&tagInfo[i]);
	}
	tmpBuf = MBuf_new(64, 0);
	urlBuf = MBuf_new(64, 0);
	tagBuf = MBuf_new(64, 0);
	warnBuf = MBuf_new(1024, 0);
	xmlSetGenericErrorFunc(NULL, cacheXmlError);
    }
}

void
RbHtml_cleanup()
{
    if (tagHash) {
	MBuf_delete(tmpBuf);
	MBuf_delete(urlBuf);
	MBuf_delete(tagBuf);
	HashTable_delete(tagHash);
	tagHash = NULL;
    }
}

/* Append ' att="val"' to mb.  When att is NULL only the converted value is
 * appended, with no attribute name and no surrounding quotes.  A NULL val
 * contributes nothing (a named attribute then comes out as att="").  With
 * urlEscape set, trailing whitespace is stripped from the value before it
 * is handed to UTF8ToMBuf() with URL escaping enabled. */
static void
putTagAttrValue(MBuf *mb, const char *att, const char *val, bool urlEscape)
{
    const bool named = (att != NULL);

    if (named)
	MBuf_vwrite(mb, " ", 1, att, -1, "=\"", 2, NULL);

    if (val != NULL) {
	int n = strlen(val);

	if (urlEscape) {
	    for ( ; n && ISSPACE(val[n-1]); n--)
		;
	}
	UTF8ToMBuf(mb, val, n, '"', urlEscape, NULL);
    }

    if (named)
	MBuf_putc(mb, '"');
}

/* SAX start-of-element callback.  userPtr is the RbPage being parsed,
 * fullnameX is the element name, and attsX is either NULL or a
 * NULL-terminated list of alternating attribute names and values.
 *
 * Tags that are not found in tagHash are ignored.  For known tags the big
 * switch does the per-tag work: it rewrites attributes into attrStr, may
 * swap ti for the TagInfo of the tag we map onto, maintains the
 * "book paragraph" and .hkey scanning state, and sets or clears
 * RBP_SPACE_OK_HERE.  After the switch the (possibly remapped) tag is
 * written to pg->content unless it was dropped or lies inside a discarded
 * container, and the tag-tree/.hidx bookkeeping is updated.
 *
 * Several cases return early, which drops the tag completely. */
static void
rbStartElement(void *userPtr, const xmlChar *fullnameX, const xmlChar **attsX)
{
    RbPage *pg = (RbPage*)userPtr;
    const char *fullname = (const char*)fullnameX;
    const char **atts = (const char**)attsX;
    RbMake *rb = pg->rb;
    htmlParserCtxtPtr ctxt = pg->ctxt;
    char *attrStr = NULL, *extraStr = NULL;
    int align, overrideParaPos = 0;
    bool dropTag = false;
    TagInfo *ti;
    TagTree *tt = pg->tagTreePos;
    const char *id, **cpp;

    if (pg->discardPage || !(ti = HashTable_fetch(tagHash, fullname)))
	return;

    pg->paraContentCnt++;

    /* Pick up the two attributes that are handled generically.  The align
     * value is matched against alignStrs[], walking down from
     * TT_ALIGN_LEFT; an unknown value leaves align at TT_ALIGN_NONE. */
    align = TT_ALIGN_NONE;
    id = NULL;
    if (atts) {
	for (cpp = atts; *cpp; cpp += 2) {
	    if (strcaseEQ(cpp[0], "align") && cpp[1]) {
		for (align = TT_ALIGN_LEFT; align > TT_ALIGN_NONE; align--) {
		    if (strcaseEQ(cpp[1], alignStrs[align]))
			break;
		}
	    }
	    else if (strcaseEQ(cpp[0], "id"))
		id = cpp[1];
	}
    }

    /* handle remapping tags and per-tag attributes */
    switch (ti->elnum) {
      case TAG_HTML:
	/* Only the first page of a joined set keeps its <HTML> tag. */
	dropTag = pg->joinOrd > 1;
	align = TT_ALIGN_NONE;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_HEAD:
	/* The whole HEAD of a later joined page is discarded. */
	if (pg->joinOrd > 1 && !pg->discardHtmlLevel)
	    pg->discardHtmlLevel = ctxt->nameNr;
	align = TT_ALIGN_NONE;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_TITLE:
	/* Remember where the title text will start: 7 is the length of
	 * the "<TITLE>" tag that is written below. */
	if (pg->firstPage)
	    pg->titlePos = pg->content->totalLen + 7;
	align = TT_ALIGN_NONE;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_META:
	if (atts) {
	    const char *name = NULL, *content = NULL;
	    const char **tpp;

	    for (cpp = atts; *cpp; cpp += 2) {
		if (strcaseEQ(cpp[0], "name"))
		    name = cpp[1];
		else if (strcaseEQ(cpp[0], "content"))
		    content = cpp[1];
	    }
	    if (!name || !content)
		return;
	    /* See metaNames[] for what the second string of a pair means:
	     * NULL drops the tag, and a non-empty key drops it on every
	     * page but the first. */
	    for (tpp = metaNames; *tpp; tpp += 2) {
		if (strcaseEQ(name, *tpp)) {
		    if (!tpp[1] || (tpp[1][0] && !pg->firstPage))
			return;
		    break;
		}
	    }
	    MBuf_truncate(tmpBuf, 0);
	    if (!*tpp || tpp[1][0]) {
		/* Unlisted name, or one that maps to an infoHash key. */
		putTagAttrValue(tmpBuf, "NAME", name, false);
		putTagAttrValue(tmpBuf, "CONTENT", content, false);

		if (*tpp && *content) {
		    char *val;
		    MBuf_truncate(urlBuf, 0);
		    UTF8ToMBuf(urlBuf, content,strlen(content), 0,false,NULL);
		    val = MBuf_dataPtr(urlBuf, NULL);
		    removeAmpersandEntities(val);
		    RbInfoHash_maybeStore(rb->infoHash, tpp[1], val);
		}
	    }
	    else { /* rocket-menu (items for the "Go To" menu) */
		/* The content has the form "description=href".  It is
		 * split in place at the first '='. */
		char *desc, *href, *url, *tn, *eq = strchr(content, '=');
		int hrefPos, len;

		if (!eq || !eq[1])
		    return;
		*eq++ = '\0';

		/* urlBuf receives the description, a '\0' separator, and
		 * then the URL-escaped href. */
		MBuf_truncate(urlBuf, 0);
		UTF8ToMBuf(urlBuf, content,strlen(content), 0, false, NULL);
		MBuf_putc(urlBuf, '\0');
		hrefPos = urlBuf->totalLen;
		len = strlen(eq);
		while (len && ISSPACE(eq[len-1])) len--;
		UTF8ToMBuf(urlBuf, eq, len, 0, true, NULL);

		desc = MBuf_dataPtr(urlBuf, NULL);
		href = MBuf_dataPtrAt(urlBuf, hrefPos, NULL);

		/* Turn a joined-page reference ("...#page<SEP>name") back
		 * into "page.html#name".  An href that starts with 'j' but
		 * contains no '#' is left untouched; it used to make us
		 * compute strchr()+1 on a NULL result. */
		if ((pg->tocFlags & RB_TOCFLAG_DEMANGLE)
		 && (*href == 'j' || *href == '#')
		 && (url = strchr(href, '#')) != NULL) {
		    char *rel;
		    url++;
		    rel = strchr(url, RB_JOIN_NAME_SEP);
		    MBuf_truncate(tagBuf, 0);
		    if (rel) {
			MBuf_vwrite(tagBuf, url,rel-url, ".html",5, NULL);
			/* A reference into this very page keeps only the
			 * "#name" part. */
			tn = strrchr(pg->url, '/') + 1;
			if (strEQ(tn, MBuf_dataPtr(tagBuf, NULL)))
			    MBuf_truncate(tagBuf, 0);
			MBuf_vwrite(tagBuf, "#",1, rel+1,-1, NULL);
		    }
		    else
			MBuf_vwrite(tagBuf, url,-1, ".html",5, NULL);
		    href = MBuf_dataPtr(tagBuf, NULL);
		}

		if (!(url = rbBuildURL(href, pg->url)))
		    tn = href;
		else
		    tn = RbPage_makeRbRef(pg, url);
		putTagAttrValue(tmpBuf, "NAME", name, false);
		MBuf_vwrite(tmpBuf, " CONTENT=\"",10, desc,-1, "=",1, tn,-1,
			    "\"",1, NULL);
		if (url) {
		    removeAmpersandEntities(desc);
		    RbMake_addMenuItem(rb, desc, url);
		    rbFreeURL(url);
		    Mem_free(tn);
		}
	    }
	    attrStr = MBuf_dataPtr(tmpBuf, NULL);
	    align = TT_ALIGN_NONE;
	    id = NULL;
	}
	else
	    return;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_FRAMESET:
	/* Make a W.A.G. as to what name to visit from the frame pages? */
	/* FALL THROUGH */
      case TAG_BODY:
	if (pg->joinOrd > 1)
	    dropTag = true;
	/* Note: The page-joining code generates some extra content down
	 * near the bottom of this function. */
	align = TT_ALIGN_NONE;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_PRE:
	pg->bookParaEndTagPos = 0;
	pg->includeRawWhitespaceLevel++;
	align = TT_ALIGN_NONE;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_BIG:
	/* Advance the .hkey scanner.  5 is the length of the "<BIG>" tag
	 * that is about to be written. */
	switch (pg->hkeyScanMode) {
	  case HKEY_INACTIVE:
	    break;
	  case HKEY_FIND_1ST_BIG:
	    if (pg->content->totalLen != pg->hkeyParaPos)
		pg->hkeyScanMode = HKEY_INACTIVE;
	    else
		pg->hkeyScanMode = HKEY_FIND_1ST_B;
	    break;
	  case HKEY_FIND_BIG_OR_SMALL:
	    pg->hkeyScanMode = HKEY_FIND_B_AFTER_BIG;
	    pg->hkeyWordPos = pg->content->totalLen + 5;
	    break;
	  default:
	    pg->hkeyScanMode = HKEY_INACTIVE;
	    break;
	}
	align = TT_ALIGN_NONE;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_SMALL:
	/* As for BIG; 7 is the length of "<SMALL>". */
	switch (pg->hkeyScanMode) {
	  case HKEY_INACTIVE:
	    break;
	  case HKEY_FIND_BIG_OR_SMALL:
	    pg->hkeyScanMode = HKEY_FIND_B_AFTER_SMALL;
	    pg->hkeyWordPos = pg->content->totalLen + 7;
	    break;
	  default:
	    pg->hkeyScanMode = HKEY_INACTIVE;
	    break;
	}
	align = TT_ALIGN_NONE;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_B:
	/* The <B> must directly follow the <BIG>/<SMALL> that was just
	 * seen (5 and 3 are the lengths of "<BIG>" and "<B>"). */
	switch (pg->hkeyScanMode) {
	  case HKEY_INACTIVE:
	    break;
	  case HKEY_FIND_1ST_B:
	    if (pg->content->totalLen != pg->hkeyParaPos + 5)
		pg->hkeyScanMode = HKEY_INACTIVE;
	    else {
		pg->hkeyScanMode = HKEY_FIND_CLOSING_BIG;
		if (pg->bookParaStartTagPos)
		    unbookify_paragraph(pg);
		pg->hkeyWordPos = pg->content->totalLen + 3;
	    }
	    break;
	  case HKEY_FIND_B_AFTER_BIG:
	  case HKEY_FIND_B_AFTER_SMALL:
	    if (pg->content->totalLen != pg->hkeyWordPos)
		pg->hkeyScanMode = HKEY_INACTIVE;
	    else {
		pg->hkeyWordPos += 3;
		/* Relies on the matching HKEY_FIND_CLOSING_* value being
		 * the next scan-mode value. */
		pg->hkeyScanMode++;
	    }
	    break;
	  case HKEY_FIND_1ST_BIG:
	    pg->hkeyScanMode = HKEY_INACTIVE;
	    break;
	}
	align = TT_ALIGN_NONE;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_EM:
	ti = HashTable_fetch(tagHash, "i");
	align = TT_ALIGN_NONE;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_STRONG:
	ti = HashTable_fetch(tagHash, "b");
	align = TT_ALIGN_NONE;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_I:
      case TAG_U:
      case TAG_S:
      case TAG_TT:
      case TAG_CODE:
      case TAG_SUP:
      case TAG_SUB:
	align = TT_ALIGN_NONE;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_P:
	/* "Book paragraph" mode: a <P> at the configured nesting depth
	 * gets INDENT_STR put in front of its text, and when it directly
	 * follows a previous book paragraph with the same alignment the
	 * preceding "</P>" is turned into "<br>" instead of opening a new
	 * <P>. */
	if (rb->bookParagraphDepth && !(pg->tocFlags & RB_TOCFLAG_MENUMARK_FILE)
	 && ctxt->nameNr == rb->bookParagraphDepth) {
	    switch (align) {
	      case TT_ALIGN_NONE:
	      case TT_ALIGN_LEFT:
	      case TT_ALIGN_JUSTIFY:
		pg->bookParaStartTagPos = pg->content->totalLen;
		if (pg->bookParaEndTagPos && align == pg->bookParaPTag->align) {
		    pg->tagTreePos = pg->bookParaPTag;
		    ti = HashTable_fetch(tagHash, "br");
		    align = TT_ALIGN_NONE;
		    /* 4 is the length of the "</P>" being replaced. */
		    overrideParaPos = pg->bookParaEndTagPos + 4;
		    /* Use lower-case "br" to signal rbburst to undo this. */
		    MBuf_overwrite(pg->content, overrideParaPos - 3, "br", 2);
		    dropTag = 1;
		}
		pg->paraContentCnt = 0;
		extraStr = INDENT_STR;
		break;
	      default:
		pg->bookParaEndTagPos = 0;
		break;
	    }
	}
	if (pg->keys) {
	    /* Position just past the "<P>" (3 chars), adjusted for the
	     * indent string and for a dropped tag. */
	    pg->hkeyParaPos = pg->content->totalLen + 3;
	    if (pg->bookParaStartTagPos)
		pg->hkeyParaPos += STATICLEN(INDENT_STR) - (dropTag? 3 : 0);
	    pg->hkeyScanMode = HKEY_FIND_1ST_BIG;
	}
	pg->dquote_level = 0;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_BR:
	if (pg->bookParaStartTagPos)
	    unbookify_paragraph(pg);
	pg->dquote_level = 0;
	align = TT_ALIGN_NONE;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_BLOCKQUOTE:
      case TAG_DIV:
	pg->bookParaEndTagPos = 0;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_A:
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	if (atts) {
	    const char *name = NULL, *href = NULL, *mmrel = NULL;
	    MBuf_truncate(tmpBuf, 0);
	    for (cpp = atts; *cpp; cpp += 2) {
		if (strcaseEQ(cpp[0], "name"))
		    name = cpp[1];
		else if (strcaseEQ(cpp[0], "href"))
		    href = cpp[1];
		else if (strcaseEQ(cpp[0], "rel"))
		    mmrel = cpp[1];
	    }
	    /* An ID attribute serves as the anchor name if NAME is absent. */
	    if (!name)
		name = id;
	    if ((pg->tocFlags & RB_TOCFLAG_UNJOINED_FRAGMENT) && name) {
		if (!(name = strchr(name, RB_JOIN_NAME_SEP))) {
		    /* Temporarily mark the shared "A" entry as a discarded
		     * container; rbEndElement() clears the flag again when
		     * the discarded level is closed. */
		    ti->flags |= RB_DISCARD_CONTAINER;
		    pg->parseFlags &= ~RBP_SPACE_OK_HERE;
		    break;
		}
		name++;
	    }
	    if (name) {
		int namepos;
		if (*name == '#')
		    name++; /* Fix a common thinko in a name tag */
		if (*name) {
		    MBuf_puts(tmpBuf, " NAME=\"");
		    namepos = tmpBuf->totalLen;
		    if (pg->joinOrd) {
			/* Joined pages prefix the name with the part of
			 * tocName after its '#' plus the separator. */
			char *tn = strchr(pg->tocName, '#') + 1;
			MBuf_puts(tmpBuf, tn);
			MBuf_putc(tmpBuf, RB_JOIN_NAME_SEP);
		    }
		    putTagAttrValue(tmpBuf, NULL, name, true);
		    RbPage_noteHidxName(pg, MBuf_dataPtrAt(tmpBuf,namepos,NULL),
					pg->content->totalLen);
		    MBuf_putc(tmpBuf, '"');
		    /* A pure name anchor does not count as paragraph
		     * content. */
		    if (!href && !mmrel)
			pg->paraContentCnt--;
		}
	    }
	    if (href) {
		char *url, *tn;
		int len = strlen(href);

		while (len && ISSPACE(href[len-1])) len--;
		MBuf_truncate(urlBuf, 0);
		UTF8ToMBuf(urlBuf, href, len, '"', true, NULL);
		if (!(href = MBuf_dataPtr(urlBuf, NULL)))
		    href = "";
		/* Same demangling as for rocket-menu items above.  An href
		 * that starts with 'j' but has no '#' (for example a
		 * "javascript:" link) is left untouched; it used to make
		 * us compute strchr()+1 on a NULL result. */
		if ((pg->tocFlags & RB_TOCFLAG_DEMANGLE)
		 && (*href == 'j' || *href == '#')
		 && (url = strchr(href, '#')) != NULL) {
		    char *rel;
		    url++;
		    rel = strchr(url, RB_JOIN_NAME_SEP);
		    MBuf_truncate(tagBuf, 0);
		    if (rel) {
			MBuf_vwrite(tagBuf, url,rel-url, ".html",5, NULL);
			tn = strrchr(pg->url, '/') + 1;
			if (strEQ(tn, MBuf_dataPtr(tagBuf, NULL)))
			    MBuf_truncate(tagBuf, 0);
			MBuf_vwrite(tagBuf, "#",1, rel+1,-1, NULL);
		    }
		    else
			MBuf_vwrite(tagBuf, url,-1, ".html",5, NULL);
		    href = MBuf_dataPtr(tagBuf, NULL);
		}
		if (!(url = rbBuildURL(href, pg->url)))
		    tn = "about:blank";
		else if (!(tn = RbPage_makeRbRef(pg, url))) {
		    /* Not a page we already know: decide by page type
		     * whether the target should be added and scheduled. */
		    int pt = rbUrlToPageType(url);
		    char *rel = strchr(url, '#');
		    NameWithType *nwt = NULL;
		    /* Hide the fragment while the URL is looked up. */
		    if (rel)
			*rel = '\0';
		    switch (pt) {
		      case RB_PAGETYPE_IMAGE:
			if (rb->includeImages && rb->shouldAllowURL(pg, url, pt)
			 && (nwt = RbMake_addPageName(rb, url, pt)) != NULL)
			    rb->scheduleURL(rb, pg, url, nwt->type);
			break;
		      case RB_PAGETYPE_AUDIO:
			if (rb->includeAudio && rb->shouldAllowURL(pg, url, pt)
			 && (nwt = RbMake_addPageName(rb, url, pt)) != NULL)
			    rb->scheduleURL(rb, pg, url, nwt->type);
			break;
		      default:
			if ((rb->followLinks < 0 || rb->followLinks > pg->depth)
			 && rb->shouldAllowURL(pg, url, pt)
			 && (nwt = RbMake_addPageName(rb, url, pt)) != NULL)
			    rb->scheduleURL(rb, pg, url, nwt->type);
		    }
		    if (rel)
			*rel = '#';
		    if (nwt)
			tn = RbPage_makeRbRef(pg, url);
		    else
			tn = url;
		}
		MBuf_vwrite(tmpBuf, " HREF=\"",7, tn,-1, "\"",1, NULL);
		if (*tn == '#')
		    RbPage_usedHidxName(pg, tn+1, 1);
		/* tn is only ours to free when it came from
		 * RbPage_makeRbRef(); the leading 'a' test is how the
		 * "about:blank" literal is told apart.
		 * NOTE(review): a generated reference that begins with 'a'
		 * would not be freed here -- confirm whether that can
		 * happen. */
		if (tn != url && *tn != 'a')
		    Mem_free(tn);
		if (url)
		    rbFreeURL(url);
	    }
	    if (mmrel)
		putTagAttrValue(tmpBuf, "REL", mmrel, false);
	    attrStr = MBuf_dataPtrAt(tmpBuf, 0, NULL);
	    align = TT_ALIGN_NONE;
	    id = NULL;
	}
	break;
      case TAG_H1:
      case TAG_H2:
      case TAG_H3:
      case TAG_H4:
      case TAG_H5:
      case TAG_H6:
	pg->bookParaEndTagPos = 0;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_CENTER:
      case TAG_UL:
      case TAG_OL:
      case TAG_LI:
	pg->bookParaEndTagPos = 0;
	align = TT_ALIGN_NONE;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_HR:
	if (atts) {
	    const char *size = NULL, *width = NULL;
	    bool newpage = false, scenebreak = false;
	    MBuf_truncate(tmpBuf, 0);
	    for (cpp = atts; *cpp; cpp += 2) {
		if (strcaseEQ(cpp[0], "size"))
		    size = cpp[1];
		else if (strcaseEQ(cpp[0], "new-page"))
		    newpage = true;
		else if (strcaseEQ(cpp[0], "scene-break"))
		    scenebreak = true;
		else if (strcaseEQ(cpp[0], "width"))
		    width = cpp[1];
	    }
	    /* A new-page rule is written as SIZE="0" and a scene break as
	     * WIDTH="42".  Ordinary rules must not collide with those
	     * markers, so a size of 0 becomes 1 (unless size-0 page
	     * breaks are allowed) and a width of 42 becomes 41. */
	    if (newpage) {
		pg->bookParaEndTagPos = 0;
		putTagAttrValue(tmpBuf, "SIZE", "0", false);
	    }
	    else if (scenebreak) {
		if (pg->bookParaEndTagPos)
		    tt = pg->bookParaPTag;
		putTagAttrValue(tmpBuf, "WIDTH", "42", false);
	    }
	    else {
		pg->bookParaEndTagPos = 0;
		if (size) {
		    if (!rb->allowHRSize0PageBreaks && atoi(size) == 0
		     && !(pg->tocFlags & RB_TOCFLAG_HR_SIZE_0_OK))
			size = "1";
		    putTagAttrValue(tmpBuf, "SIZE", size, false);
		}
		if (width) {
		    if (atoi(width) == 42)
			width = "41";
		    putTagAttrValue(tmpBuf, "WIDTH", width, false);
		}
	    }
	    attrStr = MBuf_dataPtrAt(tmpBuf, 0, NULL);
	    align = TT_ALIGN_NONE;
	}
	else
	    pg->bookParaEndTagPos = 0;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_IMG:
	if (pg->bookParaStartTagPos)
	    unbookify_paragraph(pg);
	if (atts) {
	    const char *src = NULL, *alt = NULL;
	    char *url;
	    MBuf_truncate(tmpBuf, 0);
	    for (cpp = atts; *cpp; cpp += 2) {
		if (strcaseEQ(cpp[0], "src"))
		    src = cpp[1];
		else if (strcaseEQ(cpp[0], "alt"))
		    alt = cpp[1];
#if 0
		else if (strcaseEQ(cpp[0], "border"))
		    border = cpp[1]? cpp[1] : "";
#endif
	    }
	    /* With images turned off, a text placeholder replaces the
	     * tag. */
	    if (!rb->includeImages) {
		if (alt) {
		    while (ISSPACE(*alt)) alt++;
		    if (*alt) {
			MBuf_vwrite(pg->content, "[Image: ",8, alt,-1, "]",1,
				    NULL);
			return;
		    }
		}
		MBuf_puts(pg->content, "[Image]");
		return;
	    }
	    /* The tag is only kept if its source resolves to a page name
	     * (one already known, or one newly added and scheduled). */
	    if (src && (url = rbBuildURL(src, pg->url)) != NULL) {
		NameWithType *nwt = RbMake_findPageName(rb, url);
		if (!nwt && rb->shouldAllowURL(pg, url, RB_PAGETYPE_IMAGE)
		 && (nwt = RbMake_addPageName(rb, url, RB_PAGETYPE_IMAGE)) != NULL)
		    rb->scheduleURL(rb, pg, url, nwt->type);
		rbFreeURL(url);
		if (nwt)
		    MBuf_vwrite(tmpBuf," SRC=\"",6,nwt->name,-1,"\"",1,NULL);
		else
		    return;
	    }
	    else
		return;
	    attrStr = MBuf_dataPtrAt(tmpBuf, 0, NULL);
	}
	else
	    return;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_FONT:
	{
	    /* pg->fontSize is a stack of font sizes.  The new size is
	     * clamped to 1..6; crossing the boundary between 4 and 5
	     * turns the FONT tag into BIG (going up) or SMALL (going
	     * down), otherwise ti stays FONT and nothing is written. */
	    int pos = MArray_itemCnt(pg->fontSize) - 1;
	    int sz = MArray_fetchAt(pg->fontSize, pos);
	    if (atts) {
		const char *size = NULL;
		for (cpp = atts; *cpp; cpp += 2) {
		    if (strcaseEQ(cpp[0], "size"))
			size = cpp[1];
		}
		if (size) {
		    int osz = sz;
		    if (*size == '+')
			sz += atoi(size+1);
		    else if (*size == '-')
			sz -= atoi(size+1);
		    else
			sz = atoi(size);
		    if (sz < 1)
			sz = 1;
		    else if (sz > 6)
			sz = 6;
		    if (osz <= 4 && sz >= 5)
			ti = HashTable_fetch(tagHash, "big");
		    else if (osz >= 5 && sz <= 4)
			ti = HashTable_fetch(tagHash, "small");
		}
	    }
	    MArray_append(pg->fontSize, sz);
	}
	break;
      case TAG_DL:
	pg->bookParaEndTagPos = 0;
	ti = HashTable_fetch(tagHash, "div");
	align = TT_ALIGN_NONE;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_DT:
	pg->bookParaEndTagPos = 0;
	ti = HashTable_fetch(tagHash, "br");
	align = TT_ALIGN_NONE;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_DD:
	pg->bookParaEndTagPos = 0;
	ti = HashTable_fetch(tagHash, "blockquote");
	align = TT_ALIGN_NONE;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_TABLE:
	pg->bookParaEndTagPos = 0;
	ti = HashTable_fetch(tagHash, "div");
	align = TT_ALIGN_LEFT;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_TR:
	pg->bookParaEndTagPos = 0;
	break;
      case TAG_TD:
      case TAG_TH:
	pg->bookParaEndTagPos = 0;
	ti = HashTable_fetch(tagHash, "blockquote");
	align = TT_ALIGN_NONE;
	pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	break;
      case TAG_SCRIPT:
      case TAG_STYLE:
      case TAG_NOFRAMES:
      case TAG_NOSCRIPT:
	break;
      default:
	fprintf(stderr, "switch in rbStartElement() is out of sync with array data (%d).\n",
		ti->elnum);
	break;
    }

    /* Nothing is output while we are inside a discarded container. */
    if (pg->discardHtmlLevel) {
	if (ctxt->nameNr >= pg->discardHtmlLevel)
	    return;
	pg->discardHtmlLevel = 0; /* Impossible to get here? */
    }
    /* Entering a discarded container: remember its nesting level so that
     * rbEndElement() can tell when it has been closed. */
    if (ti->flags & RB_DISCARD_CONTAINER) {
	pg->discardHtmlLevel = ctxt->nameNr;
	return;
    }
    checkDelayedChars(pg->content, pg, 0);
    if (ti->flags & RB_TRIM_WS) {
	int size = pg->content->totalLen;
	const char *cp = MBuf_dataPtrAt(pg->content, size-1, NULL);
	if (cp && ISSPACE(*cp))
	    MBuf_truncate(pg->content, size-1);
    }

    if (ti->flags & RB_IN_SUBSET) {
	if (!dropTag) {
	    /* Assemble the whole tag in tagBuf, then copy it out. */
	    MBuf_truncate(tagBuf, 0);
	    MBuf_vwrite(tagBuf, "<", 1, ti->tag, -1, NULL);
	    if (attrStr)
		MBuf_puts(tagBuf, attrStr);
	    if (align)
		putTagAttrValue(tagBuf, "ALIGN", alignStrs[align], false);
	    if (id)
		putTagAttrValue(tagBuf, "ID", id, true);
	    MBuf_putc(tagBuf, '>');
	    MBuf_write(pg->content, MBuf_dataPtrAt(tagBuf, 0, NULL),
		       tagBuf->totalLen);
	}
	if (ti->flags & (RB_NOTE_PARA | RB_NOTE_TAG)) {
	    if (ti->flags & RB_NOTE_TAG)
		tt = noteTag(pg, ti->elnum, align);
	    if (ti->flags & RB_NOTE_PARA) {
		RbPage_noteHidxPara(pg, overrideParaPos? overrideParaPos
				      : pg->content->totalLen, tt);
	    }
	    /* Only tags that will be closed become the current position
	     * in the tag tree. */
	    if (!(ti->flags & RB_NO_CLOSE_TAG))
		pg->tagTreePos = tt;

	    /* Extra content that follows a BODY tag at nesting level 2:
	     * the cover image on the first page, and for joined pages a
	     * page break plus a name anchor for the page. */
	    if (ti->elnum == TAG_BODY && ctxt->nameNr == 2) {
		if (rb->coverImage && pg->firstPage) {
		    MBuf_puts(pg->content, "\n<IMG SRC=\"cover.png\">\n<HR SIZE=0>");
		    RbPage_noteHidxPara(pg, pg->content->totalLen, tt);
		}
		if (pg->joinOrd) {
		    char *tn = strchr(pg->tocName, '#') + 1;
		    if (pg->joinOrd > 1) {
			MBuf_puts(pg->content, "\n<HR SIZE=0>");
			RbPage_noteHidxPara(pg, pg->content->totalLen, tt);
		    }
		    RbPage_noteHidxName(pg, tn, pg->content->totalLen);
		    MBuf_vwrite(pg->content, "<A NAME=\"",9, tn,-1,
				"\"></A>\n",7, NULL);
		}
	    }
	}
    }

    /* The book-paragraph indent goes after the tag itself. */
    if (extraStr)
	MBuf_puts(pg->content, extraStr);
}

/* Return the tag-tree node for an (elnum, align) tag under the current
 * tree position.  An existing child with the same element number and
 * alignment is reused; otherwise a new node is allocated, pushed onto the
 * front of the current node's child list, and appended to the nextOrd
 * chain.  pg->tagTreeRoot->sibling is used to remember the most recently
 * created node (the tail of that chain).  pg->tagTreePos is not changed
 * here. */
static TagTree *
noteTag(RbPage *pg, int elnum, int align)
{
    TagTree *parent = pg->tagTreePos;
    TagTree *node, *lastCreated;

    /* Reuse a matching child if there is one. */
    node = parent->child;
    while (node) {
	if (node->elnum == elnum && node->align == align)
	    return node;
	node = node->sibling;
    }

    lastCreated = pg->tagTreeRoot->sibling;

    node = Mem_alloc(sizeof (TagTree));
    node->elnum = elnum;
    node->align = align;
    node->ord = 0;
    node->parent = parent;
    node->child = NULL;
    node->nextOrd = NULL;
    node->sibling = parent->child;

    /* Link the new node in: newest-created marker, creation-order chain,
     * and head of the parent's child list. */
    pg->tagTreeRoot->sibling = node;
    lastCreated->nextOrd = node;
    pg->tagTreePos->child = node;

    return node;
}

/* Undo the "book paragraph" treatment of the paragraph that is currently
 * open: the INDENT_STR that was inserted after the paragraph's start
 * position is removed from pg->content.  If the paragraph had been merged
 * with the previous one (pg->bookParaEndTagPos is set), the previous
 * closing tag is restored to "</P>", a real "<P>" is put back at the
 * start position, and the .hidx paragraph entry is re-noted.  The
 * book-paragraph start position is cleared, and a non-zero
 * pg->hkeyParaPos is moved to just past the "<P>". */
static void
unbookify_paragraph(RbPage *pg)
{
    const int indentLen = STATICLEN(INDENT_STR);
    const int tagStart = pg->bookParaStartTagPos;
    const int textStart = tagStart + 3;		/* just past "<P>" */
    int newLen = pg->content->totalLen - indentLen;
    int srcPos = textStart + indentLen;

    if (pg->bookParaEndTagPos) {
	MBuf_overwrite(pg->content, pg->bookParaEndTagPos+1, "/P", 2);
	MBuf_overwrite(pg->content, tagStart, "<P>", 3);
	/* The "<P>" just written reuses 3 bytes of the indent string, so
	 * 3 fewer bytes go away and the text to keep starts 3 earlier. */
	newLen = newLen + 3;
	srcPos = srcPos - 3;
	RbPage_delLastHidxPara(pg, TAG_BR, 0);
	RbPage_noteHidxPara(pg, textStart, pg->bookParaPTag);
	pg->bookParaEndTagPos = 0;
    }

    /* Slide the paragraph text down over the indent, then cut the tail. */
    MBuf_memcpy(pg->content, textStart, srcPos, newLen - textStart);
    MBuf_truncate(pg->content, newLen);

    pg->bookParaStartTagPos = 0;
    if (pg->hkeyParaPos)
	pg->hkeyParaPos = textStart;
}

/* Translation table for the upper half (0x80-0xFF) of the windows-1252
 * character set, indexed by (character - 0x80).  Each entry is the plain
 * ASCII character that stands in for the high character; ZZZ (0) marks a
 * character that has no ASCII stand-in.  The letter case of the original
 * character is preserved.
 *
 * Fixed entries: 0x9A (small s-caron) used to map to upper-case 'S' and
 * 0xF0 (small eth) to 'o' although its capital 0xD0 maps to 'D'; 0x9E
 * (small z-caron) and 0x9F (Y-diaeresis) had no mapping although their
 * case counterparts 0x8E and 0xFF did. */
#define ZZZ 0
unsigned char xlatHighChars[128] = {
    /*80*/ ZZZ, ZZZ, ZZZ, 'f', ZZZ, ZZZ, ZZZ, ZZZ,
    /*88*/ '^', ZZZ, 'S', ZZZ, ZZZ, ZZZ, 'Z', ZZZ,
    /*90*/ ZZZ,'\'','\'', ZZZ, ZZZ, ZZZ, '-', '-',
    /*98*/ ZZZ, ZZZ, 's', ZZZ, ZZZ, ZZZ, 'z', 'Y',
    /*A0*/ ' ' ,ZZZ, ZZZ, ZZZ, ZZZ, ZZZ, ZZZ, ZZZ,
    /*A8*/ ZZZ, ZZZ, ZZZ, ZZZ, ZZZ, ZZZ, ZZZ, ZZZ,
    /*B0*/ ZZZ, ZZZ, ZZZ, ZZZ, ZZZ, ZZZ, ZZZ, ZZZ,
    /*B8*/ ZZZ, ZZZ, ZZZ, ZZZ, ZZZ, ZZZ, ZZZ, ZZZ,
    /*C0*/ 'A', 'A', 'A', 'A', 'A', 'A', ZZZ, 'C',
    /*C8*/ 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
    /*D0*/ 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x',
    /*D8*/ 'O', 'U', 'U', 'U', 'U', 'Y', ZZZ, 'B',
    /*E0*/ 'a', 'a', 'a', 'a', 'a', 'a', ZZZ, 'c',
    /*E8*/ 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
    /*F0*/ 'd', 'n', 'o', 'o', 'o', 'o', 'o', ZZZ,
    /*F8*/ 'o', 'u', 'u', 'u', 'u', 'y', ZZZ, 'y',
};
#undef ZZZ

/* SAX end-of-element callback.  userPtr is the RbPage being parsed and
 * tagnameX is the name of the element being closed.
 *
 * Unknown tags and tags flagged RB_NO_CLOSE_TAG are ignored, as is
 * everything inside a discarded container (closing the container itself
 * ends the discarding).  The switch then does the per-tag work -- mapping
 * remapped tags back onto the TagInfo that was written for them, updating
 * the book-paragraph and .hkey scanning state, and setting or clearing
 * RBP_SPACE_OK_HERE -- after which the tag-tree position is popped and the
 * closing tag is written to pg->content.  Cases that return from inside
 * the switch write no closing tag. */
static void
rbEndElement(void *userPtr, const xmlChar *tagnameX)
{
    RbPage *pg = (RbPage*)userPtr;
    const char *tagname = (const char*)tagnameX;
    RbMake *rb = pg->rb;
    htmlParserCtxtPtr ctxt = pg->ctxt;
    bool dropTag = false;
    TagInfo *ti;

    if (pg->discardPage || !(ti = HashTable_fetch(tagHash, tagname))
     || (ti->flags & RB_NO_CLOSE_TAG))
	return;

    if (pg->discardHtmlLevel) {
	if (ctxt->nameNr >= pg->discardHtmlLevel) {
	    if (ctxt->nameNr == pg->discardHtmlLevel) {
		/* This closes the discarded container itself. */
		pg->discardHtmlLevel = 0;
		/* rbStartElement() may have set this flag on the shared
		 * "A" entry to discard one particular anchor. */
		if (ti->elnum == TAG_A)
		    ti->flags &= ~RB_DISCARD_CONTAINER;
	    }
	    return;
	}
	pg->discardHtmlLevel = 0; /* Impossible to get here? */
    }
    checkDelayedChars(pg->content, pg, 0);

    switch (ti->elnum) {
      case TAG_HTML:
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	/* Joined pages do not get a closing tag written here. */
	if (pg->joinOrd)
	    return;
	break;
      case TAG_HEAD:
	/* Add our own META tags just before </HEAD>. */
	if (!(pg->tocFlags & RB_TOCFLAG_MENUMARK_FILE)) {
	    MBuf_vwrite(pg->content,
		"<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=windows-1252\">\n",-1,
		"<META NAME=\"GENERATOR\" CONTENT=\"",-1,
		RbMake_getGenerator(rb),-1, "\">\n",3, NULL);
	    pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	}
	break;
      case TAG_TITLE:
	/* Use the title text as the book's TITLE info item unless one is
	 * already set. */
	if (pg->titlePos) {
	    if (!RbInfoHash_fetch(rb->infoHash, "TITLE")) {
		int len = pg->content->totalLen - pg->titlePos;
		char buf[256];
		/* Leave room for the terminating '\0'.  The test used to
		 * be '>', which let a title of exactly sizeof buf bytes
		 * through and wrote the terminator one past the end of
		 * buf.  (As before, a negative len is also clamped by the
		 * unsigned comparison.) */
		if ((size_t)len >= sizeof buf)
		    len = sizeof buf - 1;
		MBuf_setReadPos(pg->content, pg->titlePos, 0);
		len = MBuf_read(pg->content, buf, len);
		buf[len] = '\0';
		removeAmpersandEntities(buf);
		if (*buf)
		    RbInfoHash_store(rb->infoHash, "TITLE", buf);
	    }
	    pg->titlePos = 0;
	}
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_FRAMESET:
      case TAG_BODY:
	if (pg->joinOrd) {
	    pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	    return;
	}
	break;
      case TAG_PRE:
	{
	    /* Drop one trailing whitespace character before </PRE>. */
	    int size = pg->content->totalLen;
	    const char *cp = MBuf_dataPtrAt(pg->content, size-1, NULL);
	    if (cp && ISSPACE(*cp))
		MBuf_truncate(pg->content, size-1);
	}
	pg->includeRawWhitespaceLevel--;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_P:
	if (pg->bookParaStartTagPos && ctxt->nameNr == rb->bookParagraphDepth) {
	    int bufLen = pg->content->totalLen;
	    const char *cp = MBuf_dataPtrAt(pg->content, bufLen-1, NULL);
	    if (cp && ISSPACE(*cp))
		MBuf_truncate(pg->content, --bufLen);
	    if (!pg->paraContentCnt) {
		/* The paragraph turned out to have no content: remove the
		 * indent string (and the "<P>" tag, if one was written),
		 * shift the recorded name positions accordingly, and
		 * forget the paragraph. */
		BumpInfo bump;
		bump.at = pg->bookParaStartTagPos;
		bump.by = STATICLEN(INDENT_STR);
		if (!pg->bookParaEndTagPos)
		    bump.by += 3;
		MBuf_memcpy(pg->content, bump.at, bump.at + bump.by,
			    bufLen - bump.at - bump.by);
		MBuf_truncate(pg->content, bufLen - bump.by);
		/* Turn the "<br>" join back into the previous "</P>". */
		if (pg->bookParaEndTagPos)
		    MBuf_overwrite(pg->content,pg->bookParaEndTagPos+1,"/P",2);
		HashTable_walk(pg->names, &bump, bumpHidxNames);
		RbPage_delLastHidxPara(pg, 0, 0);
		dropTag = 1;
	    }
	    else {
		if (!pg->bookParaEndTagPos) {
		    /* Use lower-case "p" to signal rbburst to undo this. */
		    MBuf_overwrite(pg->content,pg->bookParaStartTagPos+1,"p",1);
		}
		/* Remember where this "</P>" goes so that a following book
		 * paragraph can be joined onto this one. */
		pg->bookParaEndTagPos = pg->content->totalLen;
		pg->bookParaPTag = pg->tagTreePos;
	    }
	    pg->bookParaStartTagPos = 0;
	    pg->parseFlags &= ~RBP_SPACE_OK_HERE;
	}
	else
	    pg->parseFlags |= RBP_SPACE_OK_HERE;
	pg->hkeyScanMode = HKEY_INACTIVE;
	if (pg->hkeyPrevWord) {
	    Mem_free(pg->hkeyPrevWord);
	    pg->hkeyPrevWord = NULL;
	}
	break;
      case TAG_BIG:
	switch (pg->hkeyScanMode) {
	  case HKEY_INACTIVE:
	    break;
	  case HKEY_FIND_1ST_B:
	    pg->hkeyScanMode = HKEY_INACTIVE;
	    break;
	  case HKEY_FIND_B_AFTER_BIG:
	    pg->hkeyScanMode = HKEY_FIND_BIG_OR_SMALL;
	    break;
	  case HKEY_FIND_CLOSING_BIG:
	    parseHKeyItem(pg);
	    pg->hkeyScanMode = HKEY_FIND_BIG_OR_SMALL;
	    break;
	}
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_SMALL:
	switch (pg->hkeyScanMode) {
	  case HKEY_INACTIVE:
	    break;
	  case HKEY_FIND_B_AFTER_SMALL:
	    pg->hkeyScanMode = HKEY_FIND_BIG_OR_SMALL;
	    break;
	  case HKEY_FIND_CLOSING_SMALL:
	    parseHKeyItem(pg);
	    pg->hkeyScanMode = HKEY_FIND_BIG_OR_SMALL;
	    break;
	}
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_B:
      case TAG_I:
      case TAG_U:
      case TAG_S:
      case TAG_TT:
      case TAG_CODE:
      case TAG_SUP:
      case TAG_SUB:
      case TAG_A:
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_BLOCKQUOTE:
      case TAG_CENTER:
      case TAG_DIV:
      case TAG_H1:
      case TAG_H2:
      case TAG_H3:
      case TAG_H4:
      case TAG_H5:
      case TAG_H6:
      case TAG_LI:
      case TAG_OL:
      case TAG_UL:
	pg->bookParaEndTagPos = 0;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_EM:
	ti = HashTable_fetch(tagHash, "i");
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_STRONG:
	ti = HashTable_fetch(tagHash, "b");
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_FONT:
	{
	    /* Pop the font-size stack.  If opening this FONT crossed the
	     * boundary between sizes 4 and 5 it was written as BIG or
	     * SMALL, so close that tag; otherwise ti stays FONT and
	     * nothing is written. */
	    int pos = MArray_itemCnt(pg->fontSize) - 1;
	    int osz = MArray_fetchAt(pg->fontSize, pos);
	    int sz = MArray_fetchAt(pg->fontSize, pos - 1);
	    MArray_truncate(pg->fontSize, pos);
	    if (osz >= 5 && sz <= 4)
		ti = HashTable_fetch(tagHash, "big");
	    else if (osz <= 4 && sz >= 5)
		ti = HashTable_fetch(tagHash, "small");
	}
	break;
      case TAG_DL:
	ti = HashTable_fetch(tagHash, "div");
	pg->bookParaEndTagPos = 0;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_DT:
	/* DT was written as a BR, which has no closing tag. */
	pg->bookParaEndTagPos = 0;
	return;
      case TAG_DD:
	ti = HashTable_fetch(tagHash, "blockquote");
	pg->bookParaEndTagPos = 0;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_TABLE:
	ti = HashTable_fetch(tagHash, "div");
	pg->bookParaEndTagPos = 0;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_TR:
	/* The cells of the row were opened as nested BLOCKQUOTEs that are
	 * not closed by </TD> or </TH>; close them all at the end of the
	 * row.  The loop closes all but the outermost one and the common
	 * code below writes the last closing tag. */
	ti = HashTable_fetch(tagHash, "blockquote");
	if (pg->tagTreePos->elnum == ti->elnum) {
	    while (pg->tagTreePos->parent->elnum == ti->elnum) {
		MBuf_puts(pg->content, "</BLOCKQUOTE>");
		pg->tagTreePos = pg->tagTreePos->parent;
	    }
	}
	pg->bookParaEndTagPos = 0;
	pg->parseFlags |= RBP_SPACE_OK_HERE;
	break;
      case TAG_TD:
      case TAG_TH:
	pg->bookParaEndTagPos = 0;
	return;
      default:
	fprintf(stderr, "switch in rbEndElement() is out of sync with array data (%d).\n",
		ti->elnum);
	return;
    }

    /* Pop the tag tree if the current position is the tag being closed. */
    if (pg->tagTreePos->elnum == ti->elnum)
	pg->tagTreePos = pg->tagTreePos->parent;

    if (ti->flags & RB_IN_SUBSET)
	MBuf_vwrite(pg->content, "</", 2, ti->tag, -1, ">", 1, NULL);
}

/* Per-entry callback (signature: user pointer, key, stored object) that
 * shifts a recorded HTML offset downward.  userPtr is a BumpInfo and obj is
 * an HtmlPosition; any offset strictly greater than bump->at is reduced by
 * bump->by.  The key is not examined.  Always returns 1 (presumably "keep
 * iterating" for the table walker that invokes it -- confirm at the caller). */
static int
bumpHidxNames(void *userPtr, const char *key, void *obj)
{
    HtmlPosition *pos = obj;
    BumpInfo *info = userPtr;

    if (pos->htmlOffset > info->at) {
	pos->htmlOffset -= info->by;
    }
    return 1;
}

/* Turn the text that was appended to pg->content since pg->hkeyWordPos into
 * one or more index keys and append them to pg->keys.
 *
 * Each key is built in a freshly allocated StringWithPosition whose string
 * area is sized to hold the previous key (pg->hkeyPrevWord, if any) followed
 * by the raw text.  The raw text is normalised in place: tags and entities
 * are skipped, upper case is folded to lower case, whitespace becomes a
 * single blank character value, high characters go through xlatHighChars[],
 * and a ',' or ';' ends the current key (the function then loops to parse
 * the next one).  A key that begins with '-' is treated as a suffix that is
 * grafted onto the previous key using the hyphenation positions remembered
 * in pg->hkeySufPos[]. */
static void
parseHKeyItem(RbPage *pg)
{
    StringWithPosition *swp;
    unsigned char ch;
    char *bp, *t, *f;
    int len, preLen, periodCnt;
    short sufPos[4];

  start:
    len = pg->content->totalLen - pg->hkeyWordPos;
    preLen = pg->hkeyPrevWord? strlen(pg->hkeyPrevWord) : 0;
    periodCnt = 0;
    memset(sufPos, 0, sizeof sufPos);

    swp = Mem_calloc(1, len + preLen + sizeof *swp);
    /* NOTE(review): the -3 presumably backs up over a 3-byte "<P>" tag that
     * precedes the paragraph -- confirm against where hkeyParaPos is set. */
    swp->pos.htmlOffset = pg->hkeyParaPos - 3;
    swp->pos.joinOrd = pg->joinOrd;

    /* Leave preLen bytes free at the front so the previous word can be
     * copied in ahead of the new text if it turns out to be needed. */
    bp = t = f = swp->string + preLen;
    MBuf_setReadPos(pg->content, pg->hkeyWordPos, 0);
    MBuf_read(pg->content, f, len);

    /* Skip leading whitespace and one optional opening parenthesis. */
    while (ISSPACE(*f)) f++;
    if (*f == '(') f++;

    /* f reads the raw text, t writes the normalised key over the same bytes. */
    while ((ch = uc(f,0)) != '\0') {
	f++;
	switch (ch) {
	  case '<':
	    /* Skip an entire tag. */
	    do {
		if (!*f)
		    goto terminate_string;
	    } while (*f++ != '>');
	    break;
	  case '&':
	    do {
		if (!*f)
		    goto terminate_string;
	    } while (*f++ != ';');
	    /* Just dump the entity */
	    break;
	  case ';':
	  case ',':
	    /* Item separator: finish this key; "ch" is checked again at the
	     * bottom of the function to decide whether to parse another. */
	    goto terminate_string;
	  case '.':
	    periodCnt++;
	    /* FALL THROUGH */
	  default:
	    if (ch >= 0x80) {
		ch = xlatHighChars[ch-0x80];
		if (ch == 0) {
		    /* This high character has no translation: drop it, but
		     * push its output position onto the sufPos[] history. */
		    int i;
		    for (i = (sizeof sufPos) / (sizeof (short)); --i > 0; )
			sufPos[i] = sufPos[i-1];
		    sufPos[0] = t - bp;
		    break;
		}
	    }
	    else if (ISUPPER(ch))
		ch = TOLOWER(ch);
	    else if (ISSPACE(ch))
		ch = ' ';
	    else if (ch == '-') {
		/* Remember the position just after the hyphen (the hyphen
		 * itself is still written below). */
		int i;
		for (i = (sizeof sufPos) / (sizeof (short)); --i > 0; )
		    sufPos[i] = sufPos[i-1];
		sufPos[0] = t - bp + 1;
	    }
	    *t++ = ch;
	    break;
	}
    }
  terminate_string:
    /* Account for the consumed input, then trim the tail of the key:
     * trailing whitespace, a lone trailing period, and a closing paren. */
    pg->hkeyWordPos += f - bp;
    while (t > bp && ISSPACE(t[-1])) t--;
    if (periodCnt == 1 && t[-1] == '.') {
	t--;
	while (t > bp && ISSPACE(t[-1])) t--;
    }
    if (t > bp && t[-1] == ')') t--;
    *t = '\0';
    len = t - bp;
    t = swp->string;

    if (preLen) {
	if (*bp == '-') {
	    /* Suffix form: rebuild the key as previous-word stem + suffix. */
	    int l1 = pg->hkeySufPos[0] - pg->hkeySufPos[1];
	    int l2 = pg->hkeySufPos[0] - pg->hkeySufPos[2];
	    /* This writes preLen+1 bytes, so its NUL lands on the '-' at *bp. */
	    strcpy(t, pg->hkeyPrevWord);
	    bp++;
	    if (pg->hkeySufPos[2] && strnEQ(t + pg->hkeySufPos[2], bp, l2))
		bp += l2;
	    else if (strnEQ(t + pg->hkeySufPos[1], bp, l1)
		  && strnNE(t + pg->hkeySufPos[0], bp, l1)
		  && (l1 != 2 || strnNE(bp, "ti", 2)
		   || strNE(t + pg->hkeySufPos[0], "ty")))
		bp += l1;
	    /* Source and destination both live in swp->string and may
	     * overlap, so memmove() is required here (strcpy() on
	     * overlapping objects is undefined). */
	    memmove(t + pg->hkeySufPos[0], bp, strlen(bp) + 1);
	    /* Negative marker: keeps the "remember as previous word" step
	     * below from firing for a suffix-derived key. */
	    sufPos[0] = -1;
	}
	else {
	    /* Slide the key down over the unused previous-word area; the
	     * regions overlap whenever the key is longer than preLen. */
	    memmove(t, bp, strlen(bp) + 1);
	}
	len = strlen(t);
    }

    if (len) {
	if (sufPos[0] > 0) {
	    /* The key contained a split point: remember it (and its split
	     * positions) so a following "-suffix" item can build on it. */
	    int i;
	    if (pg->hkeyPrevWord)
		Mem_free(pg->hkeyPrevWord);
	    pg->hkeyPrevWord = Mem_alloc(len + 1);
	    strcpy(pg->hkeyPrevWord, t);
	    if (sufPos[0] == len) {
		/* The newest split point sits at the very end of the key;
		 * discard it by shifting the history down one slot. */
		int j = (sizeof sufPos) / (sizeof (short)) - 1;
		for (i = 0; i < j; i++)
		    sufPos[i] = sufPos[i+1];
	    }
	    for (i = (sizeof pg->hkeySufPos) / (sizeof (short)); i-- > 0; )
		pg->hkeySufPos[i] = sufPos[i];
	}
	MArray_appendPtr(pg->keys, swp);
    }
    else
	Mem_free(swp);
    /* A separator ended this key, so more items may follow. */
    if (ch == ',' || ch == ';')
	goto start;
}

/* SAX "characters" callback: append a run of character data to the page
 * content (converted by UTF8ToMBuf), unless the page or the current element
 * level is being discarded.  Before converting, the hkey scan mode is used
 * to decide whether newlines get turned into spaces, and the "find"
 * modes that were still waiting for a tag are cancelled by the text. */
static void
rbCharacters(void *userPtr, const xmlChar *strX, int len)
{
    RbPage *pg = (RbPage*)userPtr;
    htmlParserCtxtPtr ctxt = pg->ctxt;

    if (pg->discardPage)
	return;
    if (pg->discardHtmlLevel && ctxt->nameNr >= pg->discardHtmlLevel)
	return;

    switch (pg->hkeyScanMode) {
      case HKEY_FIND_BIG_OR_SMALL:
      case HKEY_FIND_CLOSING_BIG:
      case HKEY_FIND_CLOSING_SMALL:
	/* Text in these modes has its newlines flattened to spaces. */
	pg->parseFlags |= RBP_CONVERT_NL2SP;
	break;
      case HKEY_FIND_1ST_BIG:
      case HKEY_FIND_1ST_B:
      case HKEY_FIND_B_AFTER_BIG:
      case HKEY_FIND_B_AFTER_SMALL:
	/* Text arrived while one of these modes was active: stop scanning. */
	pg->hkeyScanMode = HKEY_INACTIVE;
	break;
      default:
	pg->parseFlags &= ~RBP_CONVERT_NL2SP;
	break;
    }

    if (!UTF8ToMBuf(pg->content, (const char*)strX, len, 0, false, pg))
	return;

    /* The converter reported a non-space char */
    if (ctxt->nameNr == pg->rb->bookParagraphDepth-1) {
	pg->bookParaEndTagPos = 0;
    }
    pg->paraContentCnt = 1;
}

static void
rbParserWarning(void *userPtr, const char *fmt, ...)
{
    va_list args;
    RbPage *pg = (RbPage*)userPtr;
    xmlParserInputPtr input = pg->ctxt->input;

    if (!pg->rb->verboseOutput)
	return;

    MBuf_truncate(warnBuf, 0);
    cacheXmlError(NULL, "Warning in ");
    xmlParserPrintFileInfo(input);
    cacheXmlError(NULL, "\n");
    va_start(args, fmt);
    vcacheXmlError(fmt, args);
    va_end(args);

    xmlParserPrintFileContext(input);

    RbError_warn("%s", MBuf_dataPtr(warnBuf, NULL));
}

static void
rbParserError(void *userPtr, const char *fmt, ...)
{
    va_list args;
    RbPage *pg = (RbPage*)userPtr;
    xmlParserInputPtr input = pg->ctxt->input;

    if (!pg->rb->verboseOutput)
	return;

    MBuf_truncate(warnBuf, 0);
    cacheXmlError(NULL, "Error in ");
    xmlParserPrintFileInfo(input);
    cacheXmlError(NULL, "\n");
    va_start(args, fmt);
    vcacheXmlError(fmt, args);
    va_end(args);

    xmlParserPrintFileContext(input);

    RbError_warn("%s", MBuf_dataPtr(warnBuf, NULL));
}

/* printf-style front end to vcacheXmlError(): formats the arguments and
 * appends the result to warnBuf.  The ctx argument is accepted but unused. */
static void
cacheXmlError(void *ctx, const char *fmt, ...)
{
    va_list ap;

    va_start(ap, fmt);
    vcacheXmlError(fmt, ap);
    va_end(ap);
}

/* Format a message and append it to the end of warnBuf.
 *
 * With HAVE_VSNPRINTF the exact output length is measured first and warnBuf
 * is grown by that amount.  Without it, the buffer is grown by the spare
 * room it already has (or by 4096 if that is under 2048), the text is
 * formatted, and the buffer is then cut back to the real length. */
static void
vcacheXmlError(const char *fmt, va_list args)
{
    int addLen, curLen = warnBuf->totalLen;
    char *buf;

#ifdef HAVE_VSNPRINTF
    {
	/* Measure on a copy: a va_list that has been walked by one v*printf
	 * call has an indeterminate value and must not be reused for the
	 * vsprintf() below. */
	va_list sizingArgs;
	va_copy(sizingArgs, args);
	addLen = vsnprintf(NULL, 0, fmt, sizingArgs);
	va_end(sizingArgs);
    }
    if (addLen < 0)
	RbError_exit("libc has incompatible version of vsnprintf()\n");
#else
    if ((addLen = warnBuf->allocLen - warnBuf->totalLen) < 2048)
	addLen = 4096;
#endif
    MBuf_extend(warnBuf, addLen);
    buf = MBuf_dataPtrAt(warnBuf, curLen, NULL);
    if (buf) {
	/* NOTE(review): vsprintf() also stores a terminating NUL after the
	 * formatted text; this relies on MBuf leaving room for one byte
	 * beyond totalLen -- confirm in mbuf.c. */
	vsprintf(buf, fmt, args);
#ifndef HAVE_VSNPRINTF
	MBuf_truncate(warnBuf, curLen + strlen(buf));
#endif
    }
}

/* SAX callback table handed to htmlCreatePushParserCtxt().  Only the element
 * start/end, character-data and warning/error slots are populated; every
 * other event is ignored (NULL).  The initializer is positional, so the
 * entries must stay in the order of the xmlSAXHandler fields; the slot names
 * in the comments follow libxml's declaration of that struct. */
xmlSAXHandler rbHtmlHandler = {
    NULL,		/* internalSubset */
    NULL,
    NULL,
    NULL,
    NULL,
    NULL,		/* getEntity */
    NULL,
    NULL,
    NULL,
    NULL,
    NULL,
    NULL,		/* setDocumentLocator */
    NULL,
    NULL,
    rbStartElement,	/* startElement */
    rbEndElement,	/* endElement */
    NULL,
    rbCharacters,	/* characters */
    rbCharacters,	/* ignorableWhitespace */
    NULL,
    NULL,		/* comment */
    rbParserWarning,	/* warning */
    rbParserError,	/* error */
    rbParserError,	/* fatalError (shares the error handler) */
    NULL,		/* getParameterEntity */
    NULL,
    NULL,
};

void
RbHtml_parsedPushFunc(void *userPtr, const char *bp, int len)
{
    RbPage *pg = (RbPage*)userPtr;
    if (!pg->ctxt) {
	int four;
	if (len <= 0)
	    return;
	four = len > 4? 4 : len;
	pg->ctxt = htmlCreatePushParserCtxt(&rbHtmlHandler, pg, bp, four,
					pg->url, pg->charEncoding);
	if ((len -= four) <= 0)
	    return;
	bp += four;
    }
    while (len > RB_PARSE_CHUNK_SIZE) {
	htmlParseChunk(pg->ctxt, bp, RB_PARSE_CHUNK_SIZE, 0);
	len -= RB_PARSE_CHUNK_SIZE;
	bp += RB_PARSE_CHUNK_SIZE;
    }
    htmlParseChunk(pg->ctxt, bp, len, 0);
}

/* Markup injected around plain text so the HTML parser can digest it. */
static char tagHtmlBody[] = "<HTML><BODY>";
static char tagPre[] = "<PRE>";
static char tagP[] = "<P>";

/* Feed a buffer of plain text to the page's push parser as if it were HTML.
 *
 * On the first call with data the parser context is created with an
 * "<HTML><BODY>" prefix, followed by "<PRE>" or "<P>" depending on the
 * book's textConversionMode.  The text is then pushed through in runs, with
 * '<', '>' and '&' replaced by their entities.  In RB_TEXTCONV_SIMPLE_PARA
 * mode a doubled line ending ("\n\n", "\r\r", "\r\n\r" or "\r\n\r\n") that
 * lies wholly inside this buffer additionally emits a "<P>". */
void
RbHtml_parsedTextPushFunc(void *userPtr, const char *bp, int len)
{
    RbPage *pg = (RbPage*)userPtr;
    const char *cp;

    if (!pg->ctxt) {
	if (len <= 0)
	    return;
	pg->ctxt = htmlCreatePushParserCtxt(&rbHtmlHandler, pg,
				    tagHtmlBody, STATICLEN(tagHtmlBody),
				    pg->url, pg->charEncoding);
	switch (pg->rb->textConversionMode) {
	  case RB_TEXTCONV_PRE:
	    htmlParseChunk(pg->ctxt, tagPre, STATICLEN(tagPre), 0);
	    break;
	  case RB_TEXTCONV_SIMPLE_PARA:
	    htmlParseChunk(pg->ctxt, tagP, STATICLEN(tagP), 0);
	    break;
	}
    }

    /* bp marks the start of the not-yet-pushed run and cp is the scan
     * position.  Because of the "len--" in the loop test, inside the body
     * "len" is the number of bytes that remain AFTER *cp. */
    for (cp = bp; len-- > 0; cp++) {
	switch (*cp) {
	  case '<':
	    if (cp > bp)
		htmlParseChunk(pg->ctxt, bp, cp - bp, 0);
	    htmlParseChunk(pg->ctxt, "&lt;", 4, 0);
	    bp = cp+1;
	    break;
	  case '>':
	    if (cp > bp)
		htmlParseChunk(pg->ctxt, bp, cp - bp, 0);
	    htmlParseChunk(pg->ctxt, "&gt;", 4, 0);
	    bp = cp+1;
	    break;
	  case '&':
	    if (cp > bp)
		htmlParseChunk(pg->ctxt, bp, cp - bp, 0);
	    htmlParseChunk(pg->ctxt, "&amp;", 5, 0);
	    bp = cp+1;
	    break;
	  case '\r':
	    if (pg->rb->textConversionMode == RB_TEXTCONV_SIMPLE_PARA && len) {
		/* cp[2] and cp[3] may only be examined when that many bytes
		 * really remain; otherwise we would read past the caller's
		 * buffer and push out-of-range bytes to the parser. */
		if (len >= 2 && cp[1] == '\n' && cp[2] == '\r') {
		    if (len >= 3 && cp[3] == '\n')
			cp += 3, len -= 3;
		    else
			cp += 2, len -= 2;
		}
		else if (cp[1] == '\r')
		    cp++, len--;
		else
		    break;
		/* cp now sits on the last byte of the blank-line sequence;
		 * push everything before it, then start a new paragraph. */
		htmlParseChunk(pg->ctxt, bp, cp - bp, 0);
		htmlParseChunk(pg->ctxt, tagP, STATICLEN(tagP), 0);
		bp = cp+1;
	    }
	    break;
	  case '\n':
	    if (pg->rb->textConversionMode == RB_TEXTCONV_SIMPLE_PARA && len
	     && cp[1] == '\n') {
		len--;
		cp++;
		htmlParseChunk(pg->ctxt, bp, cp - bp, 0);
		htmlParseChunk(pg->ctxt, tagP, STATICLEN(tagP), 0);
		bp = cp+1;
	    }
	    break;
	}
    }
    /* Push the final run, if any. */
    if (cp > bp)
	htmlParseChunk(pg->ctxt, bp, cp - bp, 0);
}

/* Finish a push-parse: if a parser context exists, send the terminating
 * (empty, "terminate" flag set) chunk, free the context, and clear
 * pg->ctxt.  A page with no context is left untouched. */
void
RbHtml_flushParsedPush(RbPage *pg)
{
    if (!pg->ctxt)
	return;

    htmlParseChunk(pg->ctxt, NULL, 0, 1);
    htmlFreeParserCtxt(pg->ctxt);
    pg->ctxt = NULL;
}

/* This translates flen-bytes of buffer f from UTF8 into HTML (using CP1252
 * characters) with really odd-ball characters turned into entities, and
 * optional: quoting of a quotation character, enhanced punctuation, and
 * URL-type escaping.  The result is appended onto the end of "mb".
 *
 * mb        - destination buffer (appended to).
 * f, flen   - UTF-8 input and its length in bytes.
 * quoteChar - a character that must not be written literally; it is sent
 *             down the entity path instead.
 * urlEscape - when true, various characters are written as %XX escapes
 *             (and '&' is written literally rather than as "&amp;").
 * pg        - page state used for whitespace/punctuation tweaking; may be
 *             NULL, in which case none of that tweaking is done.
 *
 * Returns true if a character was flagged as non-space: either a
 * non-whitespace character <= 0xFF seen while whitespace tweaking is on, or
 * any character handled by the final entity/CP1252 branch. */
static bool
UTF8ToMBuf(MBuf *mb, const char *f, int flen, char quoteChar,
	   bool urlEscape, RbPage *pg)
{
    unsigned int u;
    int ch, trailing;
    int tweakPunctuation = 0;
    bool tweakSpacing = false, sawNonSpace = false;

    if (pg) {
	/* Punctuation enhancement is not applied to menumark files. */
	if (!(pg->tocFlags & RB_TOCFLAG_MENUMARK_FILE))
	    tweakPunctuation = pg->rb->enhancePunctuation;
	/* Whitespace is tweaked unless raw whitespace is being included;
	 * in the raw case a following space is always acceptable. */
	if (!(tweakSpacing = !pg->includeRawWhitespaceLevel))
	    pg->parseFlags |= RBP_SPACE_OK_HERE;
    }

    while (flen > 0) {
	/* Decode one character.  The lead byte selects how many
	 * continuation bytes follow; bytes below 0xC0 and bytes of 0xF8 and
	 * up are taken as single characters as-is. */
	ch = uc(f++,0);
	flen--;
	if (ch < 0xC0)
	    u = ch, trailing = 0;
        else if (ch < 0xE0)
	    u = ch & 0x1F, trailing = 1;
        else if (ch < 0xF0)
	    u = ch & 0x0F, trailing = 2;
        else if (ch < 0xF8)
	    u = ch & 0x07, trailing = 3;
	else
	    u = ch, trailing = 0;

	/* Sequence is cut off by the end of the buffer: stop converting. */
	if (flen < trailing)
	    break;

	while (trailing--) {
	    ch = uc(f++,0);
	    flen--;
	    /* Not a continuation byte: give up on this sequence (the byte
	     * has already been consumed) and use what was gathered. */
	    if ((ch & 0xC0) != 0x80)
		break;
	    u = (u << 6) | (ch & 0x3F);
	}

	/* Resolve any pending quote/dash/ellipsis state; a true result
	 * means this character was absorbed and must not be output. */
	if (pg && checkDelayedChars(mb, pg, u))
	    continue;

	if (u <= 0xFF && u != quoteChar) {
	    if (tweakSpacing) {
		if (ISSPACE(u)) {
		    /* Drop whitespace where a space is not allowed (e.g.
		     * right after another space); otherwise normalise it. */
		    if (!(pg->parseFlags & RBP_SPACE_OK_HERE))
			continue;
		    if (u == '\r')
			u = '\n';
		    if (u == '\t'
		     || (u == '\n' && (pg->parseFlags & RBP_CONVERT_NL2SP)))
			u = ' ';
		    pg->parseFlags &= ~RBP_SPACE_OK_HERE;
		}
		else {
		    pg->parseFlags |= RBP_SPACE_OK_HERE;
		    sawNonSpace = true;
		}
	    }
	    else if (u == '\r') {
		/* Raw whitespace: CR LF collapses to the LF, and a lone CR
		 * becomes LF.  NOTE(review): *f is read without checking
		 * flen, so this looks one byte past the stated length when
		 * the CR is the last byte -- confirm callers guarantee a
		 * readable byte there. */
		if (*f == '\n')
		    continue;
		u = '\n';
	    }
	    switch (u) {
	      case '&':
		if (urlEscape)
		    MBuf_putc(mb, u);
		else
		    MBuf_puts(mb, "&amp;");
		break;
	      case '<':
		if (urlEscape)
		    MBuf_write(mb, urlEscapeChar(u), 3);
		else
		    MBuf_puts(mb, "&lt;");
		break;
	      case '>':
		if (urlEscape)
		    MBuf_write(mb, urlEscapeChar(u), 3);
		else
		    MBuf_puts(mb, "&gt;");
		break;
	      case '`':
		if (tweakPunctuation & RB_ENHANCE_SQUOTES)
		    MBuf_putc(mb, RB_CH_LSQUO);
		else if (urlEscape)
		    MBuf_write(mb, urlEscapeChar(u), 3);
		else
		    MBuf_putc(mb, u);
		break;
	      case '\'':
		if (tweakPunctuation & RB_ENHANCE_SQUOTES)
		    MBuf_putc(mb, RB_CH_RSQUO);
		else
		    MBuf_putc(mb, u);
		break;
	      case '"':
		if (tweakPunctuation & RB_ENHANCE_DQUOTES) {
		    /* Try to handle nested double quotes */
		    /* At level 0 this is an opening quote; otherwise the
		     * choice is deferred until checkDelayedChars() sees
		     * the next character. */
		    if (pg->dquote_level++)
			pg->parseFlags |= RBP_SAW_DQUOTE;
		    else
			MBuf_putc(mb, RB_CH_LDQUO);
		}
		else if (urlEscape)
		    MBuf_write(mb, urlEscapeChar(u), 3);
		else
		    MBuf_putc(mb, u);
		break;
	      case '.':
		/* Note where a run of dots starts so checkDelayedChars()
		 * can truncate back to it when it forms an ellipsis. */
		if ((tweakPunctuation & RB_ENHANCE_ELLIPSES)
		 && !(pg->parseFlags & RBP_SAW_DOT1)) {
		    pg->pre_ellipsis_len = mb->totalLen;
		    pg->parseFlags |= RBP_SAW_DOT1;
		}
		MBuf_putc(mb, u);
		break;
	      case '-':
		/* The dash is written now; a following dash lets
		 * checkDelayedChars() replace it with an em-dash. */
		if (tweakPunctuation & RB_ENHANCE_EMDASHES)
		    pg->parseFlags |= RBP_SAW_DASH;
		MBuf_putc(mb, u);
		break;
	      case ' ':
	      case '\t':
		if (urlEscape)
		    MBuf_write(mb, urlEscapeChar(u), 3);
		else
		    MBuf_putc(mb, u);
		break;
	      case '\n':
		/* Newlines are dropped entirely when URL-escaping. */
		if (!urlEscape)
		    MBuf_putc(mb, u);
		break;
	      case '^':
	      case '{':
	      case '}':
	      case '[':
	      case ']':
	      case '|':
		if (urlEscape)
		    MBuf_write(mb, urlEscapeChar(u), 3);
		else
		    MBuf_putc(mb, u);
		break;
	      default:
		if (urlEscape && u >= 0x80)
		    MBuf_write(mb, urlEscapeChar(u), 3);
		else
		    MBuf_putc(mb, u);
		break;
	    }
	}
	else if (u == 8194 || u == 8195) /* &ensp; & &emsp; */
	    MBuf_puts(mb, "&nbsp; ");
	else if (u == 8201)		 /* &thinsp; */
	    MBuf_puts(mb, "&nbsp;");
	else {
	    /* Either a character above 0xFF or the quoteChar itself: map it
	     * to a CP1252 byte if one exists, else to a named or numeric
	     * entity. */
	    const htmlEntityDesc *ent;
	    unsigned char ch;

	    if (tweakSpacing)
		pg->parseFlags |= RBP_SPACE_OK_HERE;
	    sawNonSpace = true;

	    switch (u) {
	      case 8364: ch = 128; break;/* Euro sign */
	      case 8218: ch = 130; break;/* Single low-9 quotation mark */
	      case  402: ch = 131; break;/* Latin 'f' with hook */
	      case 8222: ch = 132; break;/* Double low-9 quotation mark */
	      case 8230: ch = 133; break;/* Horizontal ellipsis */
	      case 8224: ch = 134; break;/* Dagger */
	      case 8225: ch = 135; break;/* Double dagger */
	      case  710: ch = 136; break;/* Modifier letter circumflex accent */
	      case 8240: ch = 137; break;/* Per mille sign */
	      case  352: ch = 138; break;/* Latin 'S' with caron */
	      case 8249: ch = 139; break;/* Single left-pointing angle quote */
	      case  338: ch = 140; break;/* Latin capital ligature OE */
	      case  381: ch = 142; break;/* Latin 'Z' with caron */
	      case 8216: ch = 145; break;/* Left single quote */
	      case 8217: ch = 146; break;/* Right single quote */
	      case 8220: ch = 147; break;/* Left double quote */
	      case 8221: ch = 148; break;/* Right double quote */
	      case 8226: ch = 149; break;/* Bullet */
	      case 8211: ch = 150; break;/* En dash */
	      case 8212: ch = 151; break;/* Em dash */
	      case  732: ch = 152; break;/* Small tilde */
	      case 8482: ch = 153; break;/* Trade mark sign */
	      case  353: ch = 154; break;/* Latin 's' with caron */
	      case 8250: ch = 155; break;/* Single right-pointing angle quote */
	      case  339: ch = 156; break;/* Latin small ligature oe */
	      case  382: ch = 158; break;/* Latin 'z' with caron */
	      case  376: ch = 159; break;/* Latin 'Y' with diaeresis */
	      default:
		/* No CP1252 equivalent: use the named HTML entity when
		 * libxml knows one, otherwise a numeric reference.  The
		 * "continue" skips the CP1252 output below. */
		ent = htmlEntityValueLookup(u);
		if (ent)
		    MBuf_vwrite(mb, "&",1, ent->name,-1, ";",1, NULL);
		else {
		    char nbuf[16];
		    sprintf(nbuf, "&#%d;", u);
		    MBuf_puts(mb, nbuf);
		}
		continue;
	    }
	    /* Emit the CP1252 byte chosen above. */
	    if (urlEscape)
		MBuf_write(mb, urlEscapeChar(ch), 3);
	    else
		MBuf_putc(mb, ch);
	}
    }
    return sawNonSpace;
}

/* Resolve punctuation decisions that were postponed until the following
 * character (u) became known.  The pending state lives in the RBP_SAW_*
 * bits of pg->parseFlags:
 *
 *   SAW_DQUOTE     - write a left double quote if u is an ASCII
 *                    alphanumeric, otherwise a right double quote (and
 *                    lower dquote_level by 2).
 *   SAW_DASH       - if u is '-' or an en-dash, replace the dash already in
 *                    mb with an em-dash and swallow u.
 *   SAW_DOT1       - a second '.' advances to the two-dot state; whitespace
 *                    leaves the state alone; anything else cancels it.
 *   SAW_DOT1|DOT2  - a third '.' truncates mb back to pre_ellipsis_len,
 *                    writes an ellipsis character and swallows u.
 *
 * Returns 1 when u has been consumed and must not be output by the caller,
 * 0 otherwise. */
static bool
checkDelayedChars(MBuf *mb, RbPage *pg, unsigned int u)
{
    const int pending = pg->parseFlags & RBP_ALL_SAW_BITS;

    if (pending == RBP_SAW_DQUOTE) {
	if (u < 0x7F && isalnum(u)) {
	    MBuf_putc(mb, RB_CH_LDQUO);
	}
	else {
	    pg->dquote_level -= 2;
	    MBuf_putc(mb, RB_CH_RDQUO);
	}
	pg->parseFlags &= ~RBP_SAW_DQUOTE;
	return 0;
    }

    if (pending == RBP_SAW_DASH) {
	pg->parseFlags &= ~RBP_SAW_DASH;
	if (u != '-' && u != RB_CH_NDASH)
	    return 0;
	/* Swap the previously written dash for an em-dash. */
	MBuf_truncate(mb, mb->totalLen - 1);
	MBuf_putc(mb, RB_CH_MDASH);
	return 1;
    }

    if (pending == RBP_SAW_DOT1) {
	if (u == '.')
	    pg->parseFlags |= RBP_SAW_DOT2;
	else if (!ISSPACE(u))
	    pg->parseFlags &= ~RBP_SAW_DOT1;
	return 0;
    }

    if (pending == (RBP_SAW_DOT1|RBP_SAW_DOT2)) {
	if (u == '.') {
	    /* Third dot: collapse the run into one ellipsis character. */
	    MBuf_truncate(mb, pg->pre_ellipsis_len);
	    MBuf_putc(mb, RB_CH_ELIPS);
	    pg->parseFlags &= ~(RBP_SAW_DOT1|RBP_SAW_DOT2);
	    pg->parseFlags |= RBP_SPACE_OK_HERE;
	    return 1;
	}
	if (!ISSPACE(u))
	    pg->parseFlags &= ~(RBP_SAW_DOT1|RBP_SAW_DOT2);
	return 0;
    }

    /* Nothing pending, or a combination that has no handler. */
    return 0;
}

/* Return the "%XX" URL escape (upper-case hex) for the byte value ch.  The
 * result lives in a static buffer that is overwritten by the next call. */
static char *
urlEscapeChar(int ch)
{
    static char buf[8];
    int hi = ch / 16;
    int lo = ch % 16;
    int hiDigit, loDigit;

    if (hi > 9)
	hiDigit = hi + ('A' - 10);
    else
	hiDigit = hi + '0';

    if (lo > 9)
	loDigit = lo + ('A' - 10);
    else
	loDigit = lo + '0';

    buf[0] = '%';
    buf[1] = (char)(unsigned char)hiDigit;
    buf[2] = (char)(unsigned char)loDigit;
    buf[3] = '\0';
    return buf;
}

/* Rewrite the string bp in place with its "&...;" entities removed and its
 * leading and trailing whitespace trimmed.  "&amp;", "&lt;" and "&gt;" are
 * replaced by the character they stand for; every other entity is simply
 * dropped.  An '&' with no closing ';' discards the rest of the string. */
static void
removeAmpersandEntities(char *bp)
{
    char *t, *f;
    /* Skip leading whitespace; t then compacts the text down to bp. */
    for (f = bp; ISSPACE(*f); f++) {}
    for (t = bp; *f; f++) {
	if (*f == '&') {
	    switch (f[1]) {
	      case 'a':
		if (strnEQ(f+1,"amp;",4))
		    *t++ = '&';
		break;
	      case 'l':
		if (strnEQ(f+1,"lt;",3))
		    *t++ = '<';
		break;
	      case 'g':
		if (strnEQ(f+1,"gt;",3))
		    *t++ = '>';
		break;
	    }
	    /* Advance to the ';' that ends the entity. */
	    while (*++f && *f != ';') {}
	    /* Unterminated entity: f is on the string's NUL.  Stop here so
	     * the loop's f++ cannot step past the terminator and keep
	     * reading (and copying) bytes beyond the end of the string. */
	    if (!*f)
		break;
	}
	else
	    *t++ = *f;
    }
    while (t != bp && ISSPACE(t[-1])) t--;
    *t = '\0';
}
