#!/usr/bin/env python
# NER on whole document collection
#
# 1. perform NER

############### USER-CONFIGURABLE ################

# Name of the CSV file the NER results are written to.  The path is relative,
# so the file is created in the current working directory; it is opened in
# write mode, which overwrites any existing file of the same name.
nerfname = "jobs_ner.csv"

##################################################

import string
import re
import glob
import sys

# On Python 2 only, rebind the name `range` to `xrange`.
if sys.version_info < (3,):
    range = xrange
    
import numpy as np
from itertools import product
import math
import csv

# set up MITIE's path: `mitiedir` is the MITIE installation directory, and its
# 'mitielib' subdirectory is appended to the module search path so that the
# `mitie` import below can be resolved.
mitiedir = "/usr/local/mitie/"
sys.path.append(mitiedir + '/mitielib')

from mitie import *
from collections import defaultdict

############### FUNCTIONS ################

########################################

## MAIN ##

## read all text files
fileList = glob.glob('*.txt')
#fileList = ['job01.txt', 'job02.txt', 'job05.txt'] ### subset to speed things up

docs = list()
for fn in fileList:
    fd = open(fn, 'r')
    text = fd.read()
    # strip weird ASCII
    nochangetable = string.maketrans('', '')
    deletethese = nochangetable[:9] + nochangetable[11:12] + nochangetable[14:31] + nochangetable[128:]
    text = text.translate(nochangetable, deletethese)
    spacetable = string.maketrans('\n\x0c', '  ')
    text = text.translate(spacetable)
    # replace multiple spaces with one
    text = ' '.join(text.split())
    # save cleaned-up text to document collection
    docs.append(text)
    fd.close()

## find lists of named entities
# Load the NER model once, up front: the model file is the same for every
# document, so there is no need to re-read it inside the per-document loop.
ner = named_entity_extractor(mitiedir + 'MITIE-models/english/ner_model.dat')

# namedents[i] is the list of (score, tag, lower-cased word) tuples found in
# docs[i]
namedents = list()

# NOTE: the single-argument print(...) calls below produce exactly the same
# output as the former print statements on Python 2 and also parse on
# Python 3.

# the with-block closes the CSV file even if NER fails part-way through
with open(nerfname, "w") as nerf:
    csvwr = csv.writer(nerf)
    #NERwords = {}
    #NERdocname = {}
    # fileList and docs are parallel lists (one cleaned text per file name)
    for counter, (fn, text) in enumerate(zip(fileList, docs)):
        #NERwords[counter] = list()
        #NERdocname[counter] = fn
        print("NER on [ %s ] ( %d / %d )" % (text[0:50], counter + 1, len(docs)))
        tokens = tokenize(text)
        entities = list()
        for e in ner.extract_entities(tokens):
            # each entity is indexed as (token index range, tag, score)
            rng, tag, score = e[0], e[1], e[2]
            ent_text = " ".join(tokens[i] for i in rng)
            # break multi-word entities into individual words
            for tw in set(t.strip() for t in re.split(',| ', ent_text)):
                # adjacent separators (e.g. ", ") make re.split return empty
                # strings; skip them so no blank "words" reach the CSV
                if tw:
                    entities.append((score, tag, tw.lower()))
        namedents.append(entities)

        # distinct words of this document, regardless of tag or score
        thewords = list(set(e[2] for e in entities))
        print(" ".join(["  words:"] + thewords))
        #NERwords[counter].extend(thewords)

        # CSV row layout: document index, file name, then one cell per word
        csvwr.writerow([counter, fn] + thewords)

print("END: output written to CSV file " + nerfname)

