/*
  Copyright(C) 2007-2012 National Institute of Information and Communications Technology
*/

/*
  svmtools
  Feature vector module
*/


#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include "exception.h"
#include "hashtable.h"
#include "split.h"
#include "svm_common.h"
#include "svm_kernel.h"
#include "svm_fv.h"


/* Initial size, in bytes, of the line and token buffers used by the readers in this file. */
#define BUF_SIZE (32 * 1024)


/* File-local helpers; their definitions appear further down in this file. */
static void sortsv(SVM_SV *sv);
static int gettkn(char *buf, int size, FILE *fp);


/*
  Read a *.ft file.

  Every line of fp, with its trailing newline removed, is registered in a
  freshly created table through sdb_put().  Lines of any length are
  supported: the line buffer is grown on demand.

  Returns the new table, or NULL when
    - sdb_new() fails,
    - a line is not terminated by a newline before fgets() stops
      (this includes a final line without a trailing newline),
    - fgets() hands back an empty string (the line starts with a NUL byte),
    - sdb_put() reports an error.
  On failure the partially built table is released with sdb_delete().
*/
SVM_FT *svm_fv_readft(FILE *fp) {
  SVM_FT *ft;
  int buf_size;
  char *buf;
  int len;

  /* Allocate memory & initialize. */
  ft = sdb_new();
  if (ft == NULL) return NULL;
  buf_size = BUF_SIZE;
  buf = smalloc(sizeof(char) * buf_size);

  /* Read the labels, one per line. */
  while (fgets(buf, buf_size, fp) != NULL) {
    len = (int)strlen(buf);

    /* fgets() stopped before the newline: enlarge the buffer and continue
       reading right behind what we already have. */
    while (len > 0 && buf[len - 1] != '\n') {
      buf_size = 2 * buf_size + 1;
      buf = srealloc(buf, sizeof(char) * buf_size);
      if (fgets(buf + len, buf_size - len, fp) == NULL) goto fail;
      len += (int)strlen(buf + len);
    }

    /* An empty string can only come from a NUL byte at the start of the
       line; reject it instead of indexing buf[-1]. */
    if (len == 0) goto fail;
    buf[--len] = '\0';

    /* Register the label. */
    if (sdb_put(ft, buf)) goto fail;
  }
  free(buf);

  return ft;

fail:
  free(buf);
  sdb_delete(ft);
  return NULL;
}


/*
  Write a *.ft file.

  Emits the string of every id in [0, sdb_size(ft)) to fp, one per line,
  in increasing id order.  Always returns 0.
*/
int svm_fv_writeft(SVM_FT *ft, FILE *fp) {
  const int count = sdb_size(ft);
  int id = 0;

  while (id < count) {
    fprintf(fp, "%s\n", sdb_str(ft, id));
    id++;
  }

  return 0;
}


/*
  Convert a vector in fv form (feature strings) into sv form (feature ids).

  The arrays of sv are assumed to be allocated by the caller and large
  enough for fv->num entries.

  ft   feature table used to map strings to ids
  fv   input vector; fv->val == NULL denotes a binary vector
  sv   output vector; sv->val is set to NULL when fv->val is NULL
  add  passed through to sdb_id() (presumably "register unknown features";
       confirm against the sdb module)
  sum  non-zero: values of duplicated features are added up;
       zero: duplicates are simply removed, keeping the first value.
       For binary vectors (no values) sum has no effect.

  Features for which sdb_id() returns -1 are dropped.  The result is sorted
  by ascending id and contains every id at most once.  Always returns 0.
*/
int svm_fv_fv2sv(SVM_FT *ft, SVM_FV *fv, SVM_SV *sv, int add, int sum) {
  int i;

  /* Convert the features to ids and store them. */
  sv->num = 0;
  if (fv->val == NULL) sv->val = NULL;
  for (i = 0; i < fv->num; i++) {
    int idx;

    idx = sdb_id(ft, fv->ftr[i], add) + 1;
    if (idx != 0) {
      sv->idx[sv->num] = idx;
      if (fv->val != NULL) sv->val[sv->num] = fv->val[i];
      sv->num++;
    }
  }

  /* Sort (stable, so the first occurrence of a duplicate stays first). */
  sortsv(sv);

  /* Collapse runs of equal ids in a single pass: `last` is the slot of the
     most recent distinct id, `i` scans ahead. */
  if (sv->num > 1) {
    int accumulate = (sum != 0 && sv->val != NULL);
    int last = 0;

    for (i = 1; i < sv->num; i++) {
      if (sv->idx[i] == sv->idx[last]) {
        if (accumulate) sv->val[last] += sv->val[i];
      } else {
        last++;
        sv->idx[last] = sv->idx[i];
        if (sv->val != NULL) sv->val[last] = sv->val[i];
      }
    }
    sv->num = last + 1;
  }

  return 0;
}


/*
  Convert a vector in sv form (feature ids) back into fv form (feature strings).

  The arrays of fv are assumed to be allocated by the caller.  Each stored
  id is 1-based, so the string is looked up with sdb_str(ft, id - 1).
  When sv carries no values, fv->val is set to NULL.  Always returns 0.
*/
int svm_fv_sv2fv(SVM_FT *ft, SVM_SV *sv, SVM_FV *fv) {
  int k;

  if (sv->val == NULL) {
    /* Binary vector: only the feature strings are produced. */
    fv->val = NULL;
    for (k = 0; k < sv->num; k++) {
      fv->ftr[k] = sdb_str(ft, sv->idx[k] - 1);
    }
  } else {
    for (k = 0; k < sv->num; k++) {
      fv->ftr[k] = sdb_str(ft, sv->idx[k] - 1);
      fv->val[k] = sv->val[k];
    }
  }
  fv->num = sv->num;

  return 0;
}


/*
  Read a model file in fv format (feature names instead of feature ids).

  Layout expected on fp:
    line 1     "SVM-light Version ..."
    lines 2-6  kernel type, -d, -g, -s, -r
    line 7     -u parameter (ignored)
    line 8     number of support vectors + 1
    line 9     threshold b (stored negated in mdl->b)
    line 10-   one support vector per line: "<beta> <name>:<value> ..."

  ft  in/out: when *ft is NULL a new table is created with sdb_new() and
      stored in *ft.  Feature names are converted with sdb_id(*ft, name, 1).
      The table is left in place even when reading fails.

  Features whose value parses as 0 are dropped.  When every remaining value
  equals 1 the model is stored as binary (all sv[i]->val == NULL).  Fewer
  support-vector lines than announced are accepted; reading stops at EOF.

  Returns the model, or NULL on error.  All memory allocated here is
  released before NULL is returned; with svm_verbose set, a short
  diagnostic is written to stderr.
*/
SVM_MDL *svm_fv_readmdl(SVM_FT **ft, FILE *fp) {
  int i;
  int buf_size;
  char *buf = NULL;
  int nelem;
  char **elem = NULL;
  int mdl_size;
  SVM_MDL *mdl;
  int line;
  int isbinary;
  double beta;
  int sv_size;
  int sv_len;
  int sv_num;
  int *sv_idx = NULL;
  float *sv_val = NULL;
  char *s;
  int len;
  int p_ktype;
  float p_d, p_g, p_s, p_r;
  int num;
  double b;

  /* Initialize. */
  if (*ft == NULL) {
    *ft = (SVM_FT *)sdb_new();
    if (*ft == NULL) return NULL;
  }

  /* Allocate memory.  The array members start out as NULL so that the
     failure path can release them unconditionally and so that a model
     without support vectors does not carry indeterminate pointers. */
  mdl = smalloc(sizeof(SVM_MDL));
  mdl->beta = NULL;
  mdl->sv = NULL;
  mdl->reg = NULL;
  mdl->num = 0;
  buf_size = BUF_SIZE;
  buf = smalloc(sizeof(char) * buf_size);
  elem = smalloc(sizeof(char *) * (buf_size / 2));
  sv_size = 0;
  sv_len = 0;

  /* ID */
  if (fgets(buf, buf_size, fp) == NULL) {
    if (svm_verbose) fprintf(stderr, "<%d: file read error> ", 1);
    goto fail;
  }
  if (strncmp("SVM-light Version ", buf, 18) != 0) {
    if (svm_verbose) fprintf(stderr, "<wrong ID> ");
    goto fail;
  }

  /* Parameters.  Each call must convert its value; the -u line has no
     conversion, so only EOF can be detected there. */
  if (fscanf(fp,"%d%*[^\n]\n", &p_ktype) != 1 ||
      fscanf(fp,"%f%*[^\n]\n", &p_d) != 1 ||
      fscanf(fp,"%f%*[^\n]\n", &p_g) != 1 ||
      fscanf(fp,"%f%*[^\n]\n", &p_s) != 1 ||
      fscanf(fp,"%f%*[^\n]\n", &p_r) != 1 ||
      fscanf(fp,"%*s%*[^\n]\n") == EOF ||
      fscanf(fp,"%d%*[^\n]\n", &num) != 1 ||
      fscanf(fp,"%lf%*[^\n]\n", &b) != 1) {
    if (svm_verbose) fprintf(stderr, "<invalid header> ");
    goto fail;
  }

  /* Convert. */
  if (p_ktype < 0 || p_ktype >= 4) {
    if (svm_verbose) fprintf(stderr, "<unsupported kernel type> ");
    goto fail;
  }
  mdl->kprm.ktype = p_ktype;
  mdl->kprm.degree = p_d;
  mdl->kprm.gamma = (p_ktype == 2) ? p_g : p_s;
  mdl->kprm.coef = p_r;
  mdl->b = -b;

  /* Allocate the per-support-vector arrays.  The SVM_SV structs live in one
     contiguous block addressed through mdl->sv[]. */
  mdl_size = num - 1;
  if (mdl_size > 0) {
    mdl->beta = smalloc(sizeof(double) * mdl_size);
    mdl->sv = smalloc(sizeof(SVM_SV *) * mdl_size);
    mdl->sv[0] = smalloc(sizeof(SVM_SV) * mdl_size);
    for (i = 1; i < mdl_size; i++) mdl->sv[i] = mdl->sv[i - 1] + 1;
    mdl->reg = smalloc(sizeof(float) * mdl_size);
  }

  /* Read the support vectors. */
  isbinary = 1;
  for (line = 10; line - 10 < num - 1; line++) {
    /* Read one line, growing the buffers as needed. */
    if (fgets(buf, buf_size, fp) == NULL) break;
    len = (int)strlen(buf);
    if (len == 0) {
      /* Only possible when the line starts with a NUL byte. */
      if (svm_verbose) fprintf(stderr, "<%d: invalid data (NUL character)> ", line);
      goto fail;
    }
    while (buf[len - 1] != '\n') {
      buf_size = 2 * buf_size + 1;
      buf = srealloc(buf, sizeof(char) * buf_size);
      elem = srealloc(elem, sizeof(char *) * (buf_size / 2));
      if (fgets(buf + len, buf_size - len, fp) == NULL) {
        if (svm_verbose) fprintf(stderr, "<%d: unexpected EOF> ", line);
        goto fail;
      }
      len += (int)strlen(buf + len);
    }
    buf[--len] = '\0';
    if (len > 0 && buf[len - 1] == '\r') buf[--len] = '\0';
    if (len > 0 && buf[len - 1] == ' ') buf[--len] = '\0';

    /* Split into fields.  split() is taken to return the number of fields
       it stored in elem[]; without at least one field there is no weight. */
    nelem = split(buf, ' ', elem, buf_size / 2);
    if (nelem < 1) {
      if (svm_verbose) fprintf(stderr, "<%d: invalid data (empty line)> ", line);
      goto fail;
    }

    /* Weight of the support vector. */
    beta = atof(elem[0]);

    /* Make room for the vector.  All vectors share the flat sv_idx/sv_val
       arrays, so after a reallocation the pointers of the vectors read so
       far have to be re-derived. */
    while (sv_len + nelem - 1 > sv_size) {
      sv_size = 2 * sv_size + 1;
      sv_idx = srealloc(sv_idx, sizeof(int) * sv_size);
      sv_val = srealloc(sv_val, sizeof(float) * sv_size);
      if (mdl->num > 0) {
        mdl->sv[0]->idx = sv_idx;
        mdl->sv[0]->val = sv_val;
      }
      for (i = 1; i < mdl->num; i++) {
        mdl->sv[i]->idx = mdl->sv[i - 1]->idx + mdl->sv[i - 1]->num;
        mdl->sv[i]->val = mdl->sv[i - 1]->val + mdl->sv[i - 1]->num;
      }
    }
    for (i = 1, sv_num = 0; i < nelem; i++) {
      /* The last colon separates name and value, so names may contain colons. */
      s = strrchr(elem[i], ':');
      if (s == NULL) {
        if (svm_verbose) fprintf(stderr, "<%d: invalid data (no colon)> ", line);
        goto fail;
      }

      *(s++) = '\0';
      sv_val[sv_len + sv_num] = (float)atof(s);
      if (sv_val[sv_len + sv_num] == 0.0f) continue;
      if (sv_val[sv_len + sv_num] != 1.0f) isbinary = 0;
      sv_idx[sv_len + sv_num] = sdb_id(*ft, elem[i], 1) + 1;
      if (sv_idx[sv_len + sv_num] == 0) {
        if (svm_verbose) fprintf(stderr, "<%d: string to id conversion failed> ", line);
        goto fail;
      }
      sv_num++;
    }

    /* Store the vector. */
    mdl->beta[mdl->num] = beta;
    mdl->sv[mdl->num]->num = sv_num;
    mdl->sv[mdl->num]->idx = sv_idx + sv_len;
    mdl->sv[mdl->num]->val = sv_val + sv_len;
    sortsv(mdl->sv[mdl->num]);
    sv_len += sv_num;
    mdl->num++;
  }

  /* Shrink the flat arrays to the size actually used. */
  if (sv_len > 0) {
    sv_size = sv_len;
    sv_idx = srealloc(sv_idx, sizeof(int) * sv_size);
    sv_val = srealloc(sv_val, sizeof(float) * sv_size);
    if (mdl->num > 0) {
      mdl->sv[0]->idx = sv_idx;
      mdl->sv[0]->val = sv_val;
    }
    for (i = 1; i < mdl->num; i++) {
      mdl->sv[i]->idx = mdl->sv[i - 1]->idx + mdl->sv[i - 1]->num;
      mdl->sv[i]->val = mdl->sv[i - 1]->val + mdl->sv[i - 1]->num;
    }
  }

  /* Release the work buffers. */
  free(buf);
  free(elem);

  /* If every value was 1, drop the values and mark the vectors as binary. */
  if (isbinary) {
    free(sv_val);
    for (i = 0; i < mdl->num; i++) {
      mdl->sv[i]->val = NULL;
    }
  }

  /* Initialize the kernel work data. */
  if (svm_invidx) {
    mdl->kwrk = svm_knl_kwrkinit_inv(&mdl->kprm, mdl->num, mdl->sv);
  } else {
    mdl->kwrk = svm_knl_kwrkinit(&mdl->kprm, mdl->num, mdl->sv);
  }

  return mdl;

fail:
  /* Common failure path: release everything allocated in this function.
     *ft is deliberately left untouched. */
  free(sv_idx);
  free(sv_val);
  free(elem);
  free(buf);
  if (mdl->sv != NULL) free(mdl->sv[0]);
  free(mdl->sv);
  free(mdl->beta);
  free(mdl->reg);
  free(mdl);
  return NULL;
}


/*
  Write a model file in fv format (feature names instead of feature ids).

  An example i is a support vector when alpha[i] > 0.  The header announces
  the number of support vectors plus one and the negated threshold b; both
  the -g and the -s line carry kprm->gamma.  Each support vector is written
  as "<alpha*label> <name>:<value> ...", with value 1 for binary vectors.
  With svm_verbose set, the support-vector count is reported on stderr.
  Always returns 0.
*/
int svm_fv_createmdl(SVM_FT *ft, SVM_KPRM *kprm, SVM_EXM *exm, double *alpha, double b, FILE *fp) {
  int n_sv = 0;
  int e;
  int k;

  /* Count the support vectors. */
  for (e = 0; e < exm->num; e++) {
    if (alpha[e] > 0.0) {
      n_sv++;
    }
  }
  if (svm_verbose) {
    fprintf(stderr, "(#SV=%d) ", n_sv);
  }

  /* Header. */
  fprintf(fp, "SVM-light Version V3.02\n");
  fprintf(fp,"%d # kernel type\n", kprm->ktype);
  fprintf(fp,"%.7g # kernel parameter -d \n", kprm->degree);
  fprintf(fp,"%.7g # kernel parameter -g \n", kprm->gamma);
  fprintf(fp,"%.7g # kernel parameter -s \n", kprm->gamma);
  fprintf(fp,"%.7g # kernel parameter -r \n", kprm->coef);
  fprintf(fp,"%s # kernel parameter -u \n", "empty");
  fprintf(fp,"%d # number of support vectors \n", n_sv + 1);
  fprintf(fp,"%.7g # threshold b \n", -b);

  /* One line per support vector. */
  for (e = 0; e < exm->num; e++) {
    if (!(alpha[e] > 0.0)) continue;

    fprintf(fp,"%.7g", alpha[e] * exm->label[e]);
    for (k = 0; k < exm->sv[e]->num; k++) {
      double weight;

      if (exm->sv[e]->val == NULL) {
        weight = 1.0;
      } else {
        weight = exm->sv[e]->val[k];
      }
      fprintf(fp," %s:%.7g", sdb_str(ft, exm->sv[e]->idx[k] - 1), weight);
    }
    fprintf(fp,"\n");
  }

  return 0;
}


/*
  Sort a vector by ascending feature index with a plain insertion sort.

  When the vector carries values (sv->val != NULL) they are moved together
  with their indices.  Entries with equal indices keep their relative
  order.
*/
static void sortsv(SVM_SV *sv) {
  const int with_val = (sv->val != NULL);
  int pos;

  for (pos = 1; pos < sv->num; pos++) {
    const int key_idx = sv->idx[pos];
    float key_val = 0.0f;
    int hole = pos;

    if (with_val) key_val = sv->val[pos];

    /* Shift larger entries one slot to the right to open a hole for the key. */
    while (hole > 0 && sv->idx[hole - 1] > key_idx) {
      sv->idx[hole] = sv->idx[hole - 1];
      if (with_val) sv->val[hole] = sv->val[hole - 1];
      hole--;
    }

    sv->idx[hole] = key_idx;
    if (with_val) sv->val[hole] = key_val;
  }
}


/*
  Read one example ("<label> <name>:<value> ...") from fp.

  To avoid an allocation per example, the caller hands in a reusable buffer:
  svbuf->idx / svbuf->val hold *svbuf_size entries and are grown with
  srealloc() when needed (*svbuf_size is updated).  On success sv points
  into that buffer, so it is only valid until the next call with the same
  svbuf.

  Features whose value parses as 0, and features unknown to ft
  (sdb_id(ft, name, 0) returning -1), are skipped.  When all remaining
  values equal 1, sv->val is set to NULL (binary vector).  The result is
  sorted by ascending id; of duplicated ids only the first entry is kept.

  Return value:
     0  an example was read
     1  error (a token without a colon)
    -1  no label token could be read (EOF, or an empty line)
*/
int svm_fv_readvec(SVM_FT *ft, FILE *fp, int *label, SVM_SV *svbuf, int *svbuf_size, SVM_SV *sv) {
  int i;
  char buf[BUF_SIZE];
  int isbinary;

  isbinary = 1;
  if (gettkn(buf, BUF_SIZE, fp)) return -1;
  *label = atoi(buf);
  for (svbuf->num = 0; ; ) {
    char *idx, *val;

    if (gettkn(buf, BUF_SIZE, fp)) break;

    /* The last colon separates name and value, so names may contain colons. */
    idx = buf;
    val = strrchr(buf, ':');
    if (val == NULL) {
      if (svm_verbose) fprintf(stderr, "<invalid data (no colon) [%s]> ", buf);
      return 1;
    }
    *val++ = '\0';

    if (svbuf->num + 1 > *svbuf_size) {
      *svbuf_size = 2 * *svbuf_size + 1;
      svbuf->idx = srealloc(svbuf->idx, sizeof(int) * *svbuf_size);
      svbuf->val = srealloc(svbuf->val, sizeof(float) * *svbuf_size);
    }

    svbuf->val[svbuf->num] = (float)atof(val);
    if (svbuf->val[svbuf->num] == 0.0f) continue;
    if (svbuf->val[svbuf->num] != 1.0f) isbinary = 0;
    svbuf->idx[svbuf->num] = sdb_id(ft, idx, 0) + 1;
    if (svbuf->idx[svbuf->num] == 0) continue;
    svbuf->num++;
  }

  /* Build the result on top of the shared buffer. */
  sv->num = svbuf->num;
  sv->idx = svbuf->idx;
  sv->val = svbuf->val;

  /* A vector whose values are all 1 is treated as binary. */
  if (isbinary) {
    sv->val = NULL;
  }

  /* Sort (stable, so the first occurrence of a duplicate stays first). */
  sortsv(sv);

  /* Remove duplicated features in a single pass: `last` is the slot of the
     most recent distinct id, `i` scans ahead. */
  if (sv->num > 1) {
    int last = 0;

    for (i = 1; i < sv->num; i++) {
      if (sv->idx[i] != sv->idx[last]) {
        last++;
        sv->idx[last] = sv->idx[i];
        if (sv->val != NULL) sv->val[last] = sv->val[i];
      }
    }
    sv->num = last + 1;
  }

  return 0;
}


/*
  Read one token from fp into buf (at most size - 1 characters plus the
  terminating NUL).

  Tokens are separated by a single space; a line ends at '\n', optionally
  preceded by '\r'.  The space that ends a token is consumed, whereas a
  line terminator is pushed back so that the next call reports the end of
  the line.  A token cut short by EOF is returned as it stands.  A token
  that does not fit into buf is split: the first character that did not
  fit is pushed back and starts the next token.

  Return value:
     0  a token was read (it may be empty, e.g. for two adjacent spaces)
     1  end of line
    -1  end of file
*/
static int gettkn(char *buf, int size, FILE *fp) {
  int i;
  int c;

  c = getc(fp);
  if (c == '\r') c = getc(fp);
  if (c == '\n') return 1;
  if (c == EOF) return -1;

  for (i = 0; ; i++) {
    /* EOF inside a token ends it; never store (char)EOF. */
    if (c == EOF) break;
    if (c == '\r' || c == '\n') {
      ungetc(c, fp);
      break;
    }
    if (c == ' ') break;
    if (i >= size - 1) {
      /* Buffer full: keep the pending character for the next call rather
         than dropping it.  With a buffer too small to hold any character
         it is discarded so that the caller still makes progress. */
      if (i > 0) ungetc(c, fp);
      break;
    }
    buf[i] = (char)c;
    c = getc(fp);
  }
  buf[i] = '\0';

  return 0;
}
