// ===========================================================================
// File: "aidaEncoding.c"
//                        Created: 2010-08-09 21:56:18
//              Last modification: 2013-12-20 13:37:46
// Author: Bernard Desgraupes
// e-mail: <bdesgraupes@users.sourceforge.net>
// (c) Copyright: Bernard Desgraupes 2010-2013
// All rights reserved.
// ===========================================================================

#include "aidaMain.h"

// Size, in bytes, of the stack buffer that receives the UTF-8 output of
// Tcl_ExternalToUtf() in aida_externalCharsToTclObj().
// TCL_UTF_MAX + 1 (=4) is not enough
#define AIDA_OUTBUF_SIZE   16

// Becomes true the first time aida_transcodeInput() transcodes a file. It
// guards the one-time transfer of the original input encoding to
// gEncodings->tenc and the switch of the input encoding to UTF-8.
static bool sInitTranscode = false;



// ------------------------------------------------------------------------
// 
// "aida_init_encodings" --
// 
// Make sure the global gEncodings record exists (it is allocated and
// zero-filled on first use), then register the encoding names found in
// the AIDA_INPUT_ENCODING and AIDA_OUTPUT_ENCODING environment variables.
// These names can be overridden later by the -from or the -to options.
// 
// The output variable is examined only if the input one was registered
// successfully (or was not set). Return a standard Tcl result: the status
// of the last call to aida_encoding_name(), or TCL_OK if neither variable
// is set.
// 
// ------------------------------------------------------------------------
int
aida_init_encodings()
{
	const char *	envValue;
	int				status = TCL_OK;

	// Create the global record lazily
	if (gEncodings == NULL) {
		gEncodings = (aida_enc_t*)malloc(sizeof(aida_enc_t));
		if (gEncodings == NULL) {
			aida_abort("can't allocate memory for encodings");
		}
		memset(gEncodings, 0, sizeof(aida_enc_t));
	}

	// Input encoding taken from the environment
	envValue = getenv("AIDA_INPUT_ENCODING");
	if (envValue != NULL) {
		status = aida_encoding_name(envValue, true);
	}
	if (status != TCL_OK) {
		aida_print_err("failed to register environment variable AIDA_INPUT_ENCODING\n");
		return status;
	}

	// Output encoding taken from the environment
	envValue = getenv("AIDA_OUTPUT_ENCODING");
	if (envValue != NULL) {
		status = aida_encoding_name(envValue, false);
	}
	if (status != TCL_OK) {
		aida_print_err("failed to register environment variable AIDA_OUTPUT_ENCODING\n");
	}

	return status;
}


// ------------------------------------------------------------------------
// 
// "aida_free_encodings" --
// 
// Release the global gEncodings record: the registered input and output
// encoding names, the installed input and output Tcl encodings, and the
// record itself. Calling it when gEncodings is NULL is a no-op.
// 
// gEncodings is reset to NULL afterwards so that a stale pointer is never
// dereferenced and so that aida_init_encodings() allocates a fresh record
// if it is called again.
// 
// NOTE(review): the 'tenc' field (assigned from 'ienc' in
// aida_transcodeInput) is not released here; confirm which code owns that
// reference before adding a Tcl_FreeEncoding() call for it.
// 
// ------------------------------------------------------------------------
void
aida_free_encodings()
{
	if (gEncodings == NULL) {
		return;
	}

	// free(NULL) is a no-op, no guard is needed for the names
	free(gEncodings->inam);
	free(gEncodings->onam);

	if (gEncodings->ienc != NULL) {
		Tcl_FreeEncoding(gEncodings->ienc);
	}
	if (gEncodings->oenc != NULL) {
		Tcl_FreeEncoding(gEncodings->oenc);
	}

	free(gEncodings);
	// Do not leave a dangling global behind
	gEncodings = NULL;
}


// ------------------------------------------------------------------------
// 
// "aida_encoding_name" --
// 
// Register an encoding name. The 'isInput' arg tells if it is the input or the
// output encoding. Return a standard Tcl result.
// 
// This function just registers the name of the encoding, it does not
// install it. Installation of the encoding struct is done by
// aida_set_encoding().
// 
// The name is copied with strdup(); the copy is owned by gEncodings. The
// copy is made before the previous name is freed, so it is safe to pass
// the currently registered name itself as 'inEnc'. TCL_ERROR is returned
// if gEncodings has not been allocated, if 'inEnc' is NULL, or if the copy
// can't be allocated; in all these cases the previously registered name
// (if any) is left untouched.
// 
// ------------------------------------------------------------------------
int
aida_encoding_name(const char * inEnc, bool isInput)
{
	char **		slot;
	char *		copy;

	if (gEncodings == NULL || inEnc == NULL) {
		return TCL_ERROR;
	}

	// Duplicate first: inEnc may alias the string we are about to free
	copy = strdup(inEnc);
	if (copy == NULL) {
		return TCL_ERROR;
	}

	slot = isInput ? &gEncodings->inam : &gEncodings->onam;
	free(*slot);
	*slot = copy;

	return TCL_OK;
}


// ------------------------------------------------------------------------
// 
// "aida_install_encodings" --
// 
// Install the encodings.
// 
// ------------------------------------------------------------------------
int
aida_install_encodings()
{
	int				result = TCL_OK;
	
	// Install the input encoding
	if (gEncodings->inam != NULL) {
		result = aida_set_encoding(gEncodings->inam, true);
	} else {
		// The input encoding name is needed by aida_single_encoding()
		gEncodings->inam = strdup(Tcl_GetEncodingName(gEncodings->ienc));
	}
	
	// Install the output encoding
	if (result == TCL_OK) {
		if (gEncodings->onam != NULL) {
			aida_set_encoding(gEncodings->onam, false);
		} 
	} 
	
	if (gEncodings->ienc == NULL) {
		// If the input encoding is not specified, use the system encoding
		// and check if it is utf8
		if (!strcmp(Tcl_GetEncodingName(NULL), "utf-8")) {
			gEncodings->utf8 = true;
		} 
	} 
	
	// Find if the input encoding is multi-byte
	gEncodings->sing = aida_single_encoding(gEncodings->inam);
	
	return result;
}
			

// ------------------------------------------------------------------------
// 
// "aida_set_encoding" --
// 
// Look up the Tcl encoding designated by 'inEnc' and install it in
// gEncodings as the input encoding ('isInput' true) or as the output
// encoding ('isInput' false). A previously installed encoding in the same
// slot is released with Tcl_FreeEncoding().
// 
// The aliases "utf8" and "utf-8" are looked up as "utf-8", and "utf16" and
// "utf-16" as "unicode". Any other name is passed unchanged to
// Tcl_GetEncoding().
// 
// Return TCL_OK, or TCL_ERROR (after printing a message) if the encoding
// is unknown; in that case the installed encodings are left unchanged.
// 
// NOTE(review): gEncodings->utf8 is raised for a UTF-8 name whatever the
// value of 'isInput', i.e. also when only the output encoding is UTF-8,
// and it is never lowered here. Confirm that this is intended.
// 
// ------------------------------------------------------------------------
int
aida_set_encoding(const char * inEnc, bool isInput)
{
	const char *	tclName = inEnc;
	Tcl_Encoding	found;
	Tcl_Encoding *	slot;

	// Map the accepted aliases to the names known by Tcl
	if (strcmp(inEnc, "utf-8") == 0 || strcmp(inEnc, "utf8") == 0) {
		tclName = "utf-8";
		gEncodings->utf8 = true;
	} else if (strcmp(inEnc, "utf-16") == 0 || strcmp(inEnc, "utf16") == 0) {
		tclName = "unicode";
	}

	found = Tcl_GetEncoding(gInterp, tclName);
	if (found == NULL) {
		aida_print_err("unknown encoding '%s'\n", inEnc);
		return TCL_ERROR;
	}

	// Replace the encoding currently held by the selected slot
	slot = isInput ? &gEncodings->ienc : &gEncodings->oenc;
	if (*slot != NULL) {
		Tcl_FreeEncoding(*slot);
	}
	*slot = found;

	return TCL_OK;
}


// ------------------------------------------------------------------------
// 
// "aida_single_encoding" --
// 
// Tell if the encoding is single-byte.
// 
// The answer is obtained by evaluating the Tcl command
// "aida::singleEncoding <inEnc>" at global level in gInterp and reading
// its result as a boolean. The final status (of the evaluation, or of the
// boolean conversion) is handed to aida_assert_result().
// 
// The value returned is true unless the command succeeded and yielded a
// false boolean. In particular the boolean is read only if the conversion
// succeeded, so an indeterminate value is never used.
// 
// ------------------------------------------------------------------------
bool
aida_single_encoding(const char * inEnc)
{
	int			i, objc, result, val = 0;
	bool		isOK = true;
	Tcl_Obj *	objv[2];

	// Build the command words and hold a reference during the evaluation
	objc = 2;
	objv[0] = Tcl_NewStringObj("aida::singleEncoding", -1);
	objv[1] = Tcl_NewStringObj(inEnc, -1);
	for (i = 0; i < objc; i++) {
		Tcl_IncrRefCount(objv[i]);
	}
	result = Tcl_EvalObjv(gInterp, objc, objv, TCL_EVAL_GLOBAL);
	for (i = 0; i < objc; i++) {
		Tcl_DecrRefCount(objv[i]);
	}

	if (result == TCL_OK) {
		result = Tcl_GetBooleanFromObj(gInterp, Tcl_GetObjResult(gInterp), &val);
		if (result == TCL_OK) {
			// 'val' is meaningful only if the conversion succeeded
			isOK = (val != 0);
		}
	} 
	aida_assert_result(result);
	
	return isOK;
}


// ------------------------------------------------------------------------
// 
// "aida_transcodeInput" --
// 
// Translate the input file to UTF-8 in case of multibyte encoding. The
// translated file is written in the temporary location (P_tmpdir) under a
// unique name generated by mkstemp().
// 
// Transcoding means transform the input files (original file and possible
// includes) from the input encoding (when it is a multi-byte encoding,
// such as UTF-16, Big5, shiftjis, etc.) to UTF-8 so that all the inputs
// are then parsed as UTF-8.
// 
// The first time transcoding occurs, the original input encoding is moved
// to gEncodings->tenc and the input encoding is changed to UTF-8. The
// reference is really transferred: gEncodings->ienc is cleared before
// aida_set_encoding() is called, otherwise that function would release the
// encoding still designated by 'tenc'.
// 
// The file is copied line by line, each line being terminated by a single
// "\n". If the transcoding encoding is "unicode", a Byte Order Mark found
// at the beginning of the first line is stripped from the output.
// 
// On success TCL_OK is returned, '*outTemp' tells if a temporary file has
// been created and '*outFile' receives its name (or a copy of 'inFile' if
// the input encoding is single-byte). This string is allocated by malloc
// or by strdup: it is the responsibility of the caller to free() it.
// 
// On failure TCL_ERROR is returned, the temporary file (if any) is removed
// and '*outTemp' and '*outFile' are left untouched.
// 
// ------------------------------------------------------------------------
int
aida_transcodeInput(char * inFile, bool * outTemp, char ** outFile)
{
	int				fd, len, result = TCL_OK;
	size_t			size;
	char *			dstFile = NULL;
	const char *	prefix = "aidatmp_tran_XXXXXX";
	const char *	tencName;
	bool			first = true, checkbom = false;
	Tcl_Channel		ichan, ochan;
	Tcl_Obj 		*objPtr, *inObj, *outObj, *rangeObj;
	
	// Single-byte encoding: the file is used as is
	if (gEncodings->sing) {
		dstFile = strdup(inFile);
		if (dstFile == NULL) {
			return TCL_ERROR;
		}
		*outTemp = false;
		*outFile = dstFile;
		return TCL_OK;
	}
	
	// It is a multi-byte encoding, convert the file to Utf-8
	aida_verbose(2, "transcoding input file '%s'\n", inFile);
	
	if (!sInitTranscode) {
		// The original input encoding is transferred to transcoding. The
		// input slot is cleared so that aida_set_encoding() does not
		// release the reference now held by 'tenc'.
		gEncodings->tenc = gEncodings->ienc;
		gEncodings->ienc = NULL;
		// The input encoding is now set to UTF-8
		aida_set_encoding("utf-8", true);	
		sInitTranscode = true;
	} 
	
	// Get the name of a temporary file: "<P_tmpdir>/<prefix>" plus the
	// terminating null byte. The six trailing X's required by mkstemp()
	// must be left intact.
	size = strlen(P_tmpdir) + 1 + strlen(prefix) + 1;
	dstFile = (char*)malloc(size);
	if (dstFile == NULL) {
		aida_abort("can't allocate memory for temporary transcode file name\n");
		return TCL_ERROR;
	}
	snprintf(dstFile, size, "%s/%s", P_tmpdir, prefix);
	fd = mkstemp(dstFile);
	if (fd < 0) {
		aida_abort("can't create temporary transcode file\n");
		// Only reached if aida_abort() returns
		free(dstFile);
		return TCL_ERROR;
	}
	close(fd);
	aida_verbose(3, "created temporary transcode file '%s'\n", dstFile);
	
	// Open channels for the input and output files
	inObj = Tcl_NewStringObj(inFile, -1);
	Tcl_IncrRefCount(inObj);
	ichan = Tcl_FSOpenFileChannel(gInterp, inObj, "r", 0644);
	Tcl_DecrRefCount(inObj);
	if (ichan == NULL) {
		goto failed;
	}
	
	outObj = Tcl_NewStringObj(dstFile, -1);
	Tcl_IncrRefCount(outObj);
	ochan = Tcl_FSOpenFileChannel(gInterp, outObj, "w", 0666);
	Tcl_DecrRefCount(outObj);
	if (ochan == NULL) {
		// A NULL interp keeps the error message left by the failed open
		Tcl_Close(NULL, ichan);
		goto failed;
	}
	
	// Set the encodings
	tencName = Tcl_GetEncodingName(gEncodings->tenc);
	Tcl_SetChannelOption(NULL, ichan, "-encoding", tencName);
	Tcl_SetChannelOption(NULL, ochan, "-encoding", "utf-8");
	
	// Must we check for the presence of a BOM?
	checkbom = (strcmp("unicode", tencName) == 0);
	
	// Read the input line by line and copy to the output. Tcl_GetsObj()
	// returns a negative value when no more line can be read, so nothing
	// is written for the failed read at end of file.
	objPtr = Tcl_NewObj();
	Tcl_IncrRefCount(objPtr);
	while (Tcl_GetsObj(ichan, objPtr) >= 0) {
		if (first && checkbom && aida_has_BOM(objPtr)) {
			// Strip the BOM from the output: keep chars 1 to len-1 (the
			// last index of Tcl_GetRange is inclusive)
			len = Tcl_GetCharLength(objPtr);
			if (len > 1) {
				rangeObj = Tcl_GetRange(objPtr, 1, len - 1);
				Tcl_IncrRefCount(rangeObj);
				Tcl_WriteObj(ochan, rangeObj);
				Tcl_DecrRefCount(rangeObj);
			} 
		} else {
			Tcl_WriteObj(ochan, objPtr);
		} 
		Tcl_WriteChars(ochan, "\n", 1);
		first = false;
		// Tcl_GetsObj() appends: empty the object for the next line
		Tcl_SetObjLength(objPtr, 0);
	}
	Tcl_DecrRefCount(objPtr);
	
	// Close the channels. Closing the output flushes it: a failure means
	// that the transcoded file is not complete.
	Tcl_Close(gInterp, ichan);
	if (Tcl_Close(gInterp, ochan) != TCL_OK) {
		goto failed;
	}

	// Parse as UTF-8
	gEncodings->utf8 = true;
	
	*outTemp = true;
	*outFile = dstFile;
	
	return result;

failed:
	// Do not leave the temporary file nor its name behind
	remove(dstFile);
	free(dstFile);
	return TCL_ERROR;
}


// ------------------------------------------------------------------------
// 
// "aida_has_BOM" --
// 
// Tell if the first character of the string held by 'inObj' is the Byte
// Order Mark (U+FEFF).
// 
// The character is obtained, already decoded, from Tcl_GetUniChar() and is
// compared by value. The test thus depends neither on the endianness of
// the machine nor on the size of the Tcl_UniChar type, which would not be
// the case if the bytes of the Tcl_UniChar were inspected in memory.
// 
// Return false if 'inObj' is NULL or if its string is empty.
// 
// ------------------------------------------------------------------------
bool
aida_has_BOM(Tcl_Obj * inObj)
{
	// There must be a first char to look at
	if (inObj == NULL || Tcl_GetCharLength(inObj) < 1) {
		return false;
	}

	return (Tcl_GetUniChar(inObj, 0) == 0xFEFF);
}


// ------------------------------------------------------------------------
// 
// "aida_externalToTclObj" --
// 
// Build a new Tcl_Obj from a null-terminated string expressed in the input
// encoding. The string is first converted to UTF-8 in a temporary DString
// with Tcl_ExternalToUtfDString(). If gEncodings->ienc is NULL, that
// function uses the system encoding by default.
// 
// The object returned is the one created by Tcl_NewStringObj(); its
// reference count is not modified here.
// 
// ------------------------------------------------------------------------
Tcl_Obj * 
aida_externalToTclObj(char * inStr)
{
	Tcl_DString		utf;
	const char *	converted;
	Tcl_Obj *		resObj;

	Tcl_DStringInit(&utf);
	// The value returned is the string stored in the DString
	converted = Tcl_ExternalToUtfDString(gEncodings->ienc, inStr, -1, &utf);
	resObj = Tcl_NewStringObj(converted, Tcl_DStringLength(&utf));
	Tcl_DStringFree(&utf);

	return resObj;
}


// ------------------------------------------------------------------------
// 
// "aida_externalCharsToTclObj" --
// 
// Try to build a Tcl_Obj out of the 'inLen' bytes found at 'inStr', which
// are expressed in the input encoding. The bytes are converted to UTF-8
// by Tcl_ExternalToUtf() in a local buffer of AIDA_OUTBUF_SIZE bytes.
// 
// If gEncodings->utf8 is set and the bytes do not make a complete UTF-8
// character, no conversion is attempted: NULL is returned with
// TCL_CONVERT_MULTIBYTE in 'outErr' and the function will have to be
// called again with more bytes.
// 
// Otherwise 'outErr' receives the status of Tcl_ExternalToUtf() and a new
// object is returned only if this status is TCL_OK (NULL in all the other
// cases). According to the Tcl documentation, the status can also be
// TCL_CONVERT_MULTIBYTE or TCL_CONVERT_NOSPACE. TCL_CONVERT_SYNTAX and
// TCL_CONVERT_UNKNOWN are possible only with the TCL_ENCODING_STOPONERROR
// flag, which is not used here.
// 
// Each call is a complete conversion (both TCL_ENCODING_START and
// TCL_ENCODING_END are passed), no state is kept between the calls.
// 
// ------------------------------------------------------------------------
Tcl_Obj * 
aida_externalCharsToTclObj(char * inStr, int inLen, int *outErr)
{
	Tcl_EncodingState	encState;
	char				utfBuf[AIDA_OUTBUF_SIZE];
	int					numRead, numBytes, numChars, status;

	// In UTF-8, wait until the bytes make a complete character
	if (gEncodings->utf8 && !Tcl_UtfCharComplete(inStr, inLen)) {
		*outErr = TCL_CONVERT_MULTIBYTE;
		return NULL;
	}

	status = Tcl_ExternalToUtf(gInterp, gEncodings->ienc,
							inStr, inLen,
							TCL_ENCODING_START | TCL_ENCODING_END,
							&encState, utfBuf, AIDA_OUTBUF_SIZE,
							&numRead, &numBytes, &numChars);

	*outErr = status;
	if (status != TCL_OK) {
		return NULL;
	}

	return Tcl_NewStringObj(utfBuf, numBytes);
}


// ------------------------------------------------------------------------
// 
// "aida_TclObjToExternalDString" --
// 
// Convert the string representation of a Tcl_Obj to the specified
// encoding. The DString is provided by the caller, who is also in charge
// of freeing it with Tcl_DStringFree().
// 
// The length reported by Tcl_GetStringFromObj() is passed to the
// conversion, so the whole string representation is converted and not
// only the part preceding a possible null byte.
// 
// Return the converted string, i.e. the value of the DString.
// 
// ------------------------------------------------------------------------
char *
aida_TclObjToExternalDString(Tcl_Obj * inObj, Tcl_Encoding inExtEnc, Tcl_DString *outDStr)
{
	int				len = 0;
	const char *	utf;
	
	utf = Tcl_GetStringFromObj(inObj, &len);
	Tcl_UtfToExternalDString(inExtEnc, utf, len, outDStr);
	
	return Tcl_DStringValue(outDStr);	
}



