#!/usr/bin/gawk -f

#
# Copyright(C) 2007-2012 National Institute of Information and Communications Technology
#

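# Merges CRF++-style tagger output (token rows carrying a B/I tag in column 7,
# sentences separated by empty lines) back into columns 6-9 of a *.tsv file.
# NOTE(review): the original Japanese header was garbled by a charset
# conversion; it appears to have said this file must be saved as EUC-JP.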

# Startup: use tab-separated fields for input and output, pull the *.tsv path
# off the command line, and reset the accumulators used by the record rule.
BEGIN {
  FS = OFS = "\t";

  if (ARGC >= 2) {
    # Remember the *.tsv path for the END rule, then blank the argument so
    # that awk does not read it as ordinary input; the record rule therefore
    # sees only the remaining file argument(s), or stdin when there are none.
    tsvfile = ARGV[1];
    ARGV[1] = "";

    buf = sen = "";      # entity being assembled / sentence text so far
    cnt = extnum = 0;    # entities in this sentence / sentences finished
  } else {
    # SCRIPT is not assigned anywhere in this file; presumably the caller
    # passes it (e.g. with -v) -- otherwise the name prints as empty.
    printf "usage: %s <*.tsv file> [<*.otag>]\n", SCRIPT > "/dev/stderr";
    # EXIT is checked again in END, which still runs after exit in BEGIN.
    EXIT = 1;
    exit EXIT;
  }
}

# Per-record rule for the tagger output.
#   $1 is the token text, $7 its tag ("B" starts an entity, "I" continues it).
#   An empty record ends the current sentence.
# Fills extent[k] with the sentence's entities joined by a literal
# backslash-n pair, and extsen[k] with the concatenated token text.
{
  # Count input lines.
  nlines++;

  # Anything other than an "I" row (including the empty separator record)
  # terminates the entity currently being assembled.
  if ($7 != "I" && buf != "") {
    vec[cnt] = buf;
    cnt++;
    buf = "";
  }

  if ($0 != "") {
    # Token row: open or extend an entity, and always extend the sentence.
    if ($7 == "B") {
      buf = $1;
    } else if ($7 == "I") {
      buf = buf $1;
    }
    sen = sen $1;
  } else {
    # Sentence boundary: join the collected entities with "\n" (two
    # characters, not a newline) and store the result for the END rule.
    joined = "";
    for (i = 0; i < cnt; i++) {
      joined = (i == 0) ? vec[i] : (joined "\\n" vec[i]);
    }
    extent[extnum] = joined;
    extsen[extnum] = sen;
    extnum++;

    # Start the next sentence with empty accumulators.
    sen = "";
    cnt = 0;
  }
}

# Replace every item of EXT with REPL while keeping the item separators.
#   ext  - items joined by the two-character sequence backslash + "n"
#   repl - replacement text for each item (used as a gsub replacement, so
#          "&" and backslashes in it would be special; callers pass plain text)
# Returns the rewritten string; the caller's variable is not modified.
function maskExtentItems(ext, repl) {
  # A tab is used as a one-character stand-in for the separator so that
  # "anything but the separator" can be written as a character class.
  # Items are built from tab-split fields, so they should not contain a tab.
  gsub(/\\n/, "\t", ext);
  gsub(/[^\t]+/, repl, ext);
  gsub(/\t/, "\\n", ext);
  return ext;
}

# After all tagger output has been read: walk the *.tsv file row by row,
# check that row y belongs to sentence y ($5 must equal the concatenated
# tokens), overwrite columns 6-9 from the collected entities, and print the
# row. error() and showError() are not defined in this file; they are
# expected to be provided elsewhere.
END {
  # Usage error already reported in BEGIN: just propagate the exit status.
  if (EXIT != "") exit EXIT;

  for (y = 0; ; ) {
    r = getline < tsvfile;
    if (r == 0) break;
    if (r < 0) {
      error("file I/O error");
      # If error() returns instead of exiting, retrying the read would
      # fail the same way forever, so leave the loop.
      break;
    }

    if (extsen[y] != $5) {
      showError("sentence mismatch");
      # Report the *.tsv row being processed (1-based). nlines is the total
      # number of tagger lines at this point and would be the same for
      # every mismatch.
      printf " = Line:%d\n", (y + 1) > "/dev/stderr";
    }

    if (extent[y] == "") {
      # No entities in this sentence: blank columns 6-9.
      $6 = "";
      $7 = "";
      $8 = "";
      $9 = "";
    } else {
      # One value per entity in each column, all sharing the same separators.
      $6 = maskExtentItems(extent[y], "+1");
      $7 = maskExtentItems(extent[y], "[]");
      $8 = extent[y];
      $9 = maskExtentItems(extent[y], "");
    }
    y++;
    print;
  }
  close(tsvfile);

  # Every collected sentence must have been matched to exactly one row.
  if (y != extnum) error("#entities mismatch");
}
