#!/usr/bin/gawk -f

#
# Copyright(C) 2007-2012 National Institute of Information and Communications Technology
#

# Convert *.tsv data into the CRF++ input format.
# When evaluative expressions overlap, the expression written first takes priority.

BEGIN {
  # Input records are tab-separated.
  FS = "\t";

  # The dictionary path is mandatory; without it, print usage and stop.
  # EXIT is kept as a global so that code elsewhere can see the status.
  if (ARGC <= 1) {
    printf("usage: %s <dictionary> [<*.tsv>]\n", SCRIPT) > "/dev/stderr";
    EXIT = 1;
    exit EXIT;
  }

  # Take the first command-line operand as the dictionary and blank it out
  # so that awk does not also read it as a data file.
  dicfile = ARGV[1];
  ARGV[1] = "";

  # Load the dictionary (with morphological analysis) and store it in a trie.
  # dicread() is defined elsewhere; its return value is kept in ndic.
  ndic = dicread(dicfile, dicw, dicp, trie_flg, trie_val);
}

{
  # Count input records; the counter is reported in error messages below.
  nlines++;

  # Fields used from the record: 5 = sentence, 8 = annotated expressions,
  # 10 = input handed to ma().
  sen = $5;
  xprall = $8;
  mrp = $10;

  # Field 8 holds several expressions separated by a literal backslash
  # followed by "n" (two characters), not by a real newline.
  nxpr = split(xprall, xprelem, /\\n/);

  # ma() is defined elsewhere; it returns the token count n and fills the
  # per-token arrays surf/base/cpos/fpos that are printed at the end.
  n = ma(mrp, surf, base, cpos, fpos);

  # Polarity assignment (trie search). Every token starts as "*"; tokens
  # covered by a dictionary hit get that entry's value from dicp.
  for (i = 1; i <= n; i++) pole[i] = "*";
  nlist = lookup(n, base, 1, n, trie_flg, trie_val, listv, listb, listl);
  for (i = 0; i < nlist; i++) {
    # NOTE(review): the bound is inclusive, so listl[i] is treated as the
    # offset of the last covered token from listb[i] -- confirm in lookup().
    for (j = 0; j <= listl[i]; j++) {
      pole[listb[i] + j] = dicp[listv[i]];
    }
  }

  # BIO tagging. Every token starts as "O" (outside).
  for (i = 1; i <= n; i++) tag[i] = "O";
  for (z = 1; z <= nxpr; z++) {
    xpr = xprelem[z];

    # position() is defined elsewhere; it returns -1 when xpr cannot be
    # located and otherwise leaves the token span in globals PSTART/PEND.
    p = position(sen, n, surf, xpr, 0);
    if (p == -1) {
      showError("xpr !in sen");
      printf " = Line:%d\n", nlines > "/dev/stderr";
      continue;
    }

    # The expression written first wins on overlap: skip this one if either
    # end of its span is already tagged ...
    if (tag[PSTART] != "O" || tag[PEND] != "O") continue;

    # ... and also if any interior token is already tagged. Checking only the
    # two ends would let an expression that strictly contains an earlier one
    # overwrite the earlier expression's tags.
    xpr_overlap = 0;
    for (i = PSTART + 1; i < PEND; i++) {
      if (tag[i] != "O") {
        xpr_overlap = 1;
        break;
      }
    }
    if (xpr_overlap) continue;

    tag[PSTART] = "B";
    for (i = PSTART + 1; i <= PEND; i++) tag[i] = "I";
  }

  # Data output: one tab-separated line per token, then a blank line that
  # terminates the sentence.
  for (i = 1; i <= n; i++) {
    printf "%s\t%s\t%s\t%s\t%s\t%s\n", surf[i], base[i], cpos[i], fpos[i], pole[i], tag[i];
  }
  printf "\n";
}
