#!/usr/bin/env python
# NER + relation extraction with MITIE

import string
import re
import glob
import sys

# Set up MITIE's path. mitiedir is the install root; it is also used further
# down to locate the model files. The bindings directory is appended to
# sys.path, presumably so that the `from mitie import *` below resolves.
# NOTE(review): mitiedir already ends in '/', so the appended entry contains a
# double slash ("/usr/local/mitie//mitielib") -- presumably harmless on POSIX.
mitiedir = "/usr/local/mitie/"
sys.path.append(mitiedir + '/mitielib')

from mitie import *
from collections import defaultdict

# relation extractor function
def relextract(extrname, extractor, entities):
    """Return the related pairs among neighboring entities.

    A detector is built from ``extractor`` with binary_relation_detector and
    applied to every pair of adjacent entities, in both orders. The function
    relies on the module-level ``ner`` and ``tokens`` objects.

    extrname  -- relation name; accepted for the caller's convenience, unused
    extractor -- value handed to binary_relation_detector
    entities  -- sequence of items whose element 0 holds the token indices
                 of one entity

    Returns a list of (ent1_text, ent2_text) tuples, one for each ordered
    pair whose detector score is greater than 0.
    """
    rel_detector = binary_relation_detector(extractor)
    token_ranges = [entity[0] for entity in entities]
    # Adjacent entities in document order; fewer than two entities gives
    # no pairs at all.
    forward_pairs = list(zip(token_ranges, token_ranges[1:]))
    # A relation is directional, so every pair is also tried reversed.
    candidate_pairs = forward_pairs + [(r, l) for (l, r) in forward_pairs]
    relations = []
    for ent1, ent2 in candidate_pairs:
        rel = ner.extract_binary_relation(tokens, ent1, ent2)
        if rel_detector(rel) > 0:
            ent1_text = " ".join(tokens[i] for i in ent1)
            ent2_text = " ".join(tokens[i] for i in ent2)
            relations.append((ent1_text, ent2_text))
    return relations

# The input text file is the first command-line argument.
if len(sys.argv) < 2:
    sys.exit('need filename on cmd line')
textname = sys.argv[1]

# Read the whole file; the with-block closes the handle once the read is done
# instead of leaving it open for the rest of the run.
with open(textname, "r") as textf:
    text = textf.read()

# strip weird ASCII (uses the Python 2 byte-string translate API)
# identity table: all 256 byte values in order
nochangetable = string.maketrans('', '')
# Delete control bytes 0-8, 11 and 14-31 plus every non-ASCII byte (128-255).
# Tab (9), LF (10), FF (12) and CR (13) survive this step and act as
# separators below. The third slice ends at 32 so that 0x1f is removed too;
# an end of 31 stops one short and lets that control byte through.
deletethese = nochangetable[:9] + nochangetable[11:12] + nochangetable[14:32] + nochangetable[128:]
text = text.translate(nochangetable, deletethese)
# turn newlines and form feeds into spaces
spacetable = string.maketrans('\n\x0c', '  ')
text = text.translate(spacetable)
# collapse every run of whitespace into a single space
text = ' '.join(text.split())

# MITIE

# recognize named entities
ner = named_entity_extractor(mitiedir + 'MITIE-models/english/ner_model.dat')
# alternative kept for reference (tokenizes the file named on the command
# line instead of the cleaned text):
#tokens = tokenize(load_entire_file(textname))
tokens = tokenize(text)
ents = ner.extract_entities(tokens)

# Flatten each detection into (score, tag, text). The token indices are bound
# to `token_range` rather than `range`, so the builtin is not shadowed at
# module level for the remainder of the script.
entities = []
for e in ents:
    token_range = e[0]
    tag = e[1]
    score = e[2]
    ent_text = " ".join(tokens[i] for i in token_range)
    entities.append((score, tag, ent_text))

# entity to tag
# Map each distinct entity text to the tag of its highest-scoring mention.
# ent2tagscores holds the best score seen so far for every text; it has to be
# updated together with ent2tag, otherwise the comparison never fires and the
# last mention wins regardless of its score.
ent2tag = {}
ent2tagscores = {}
for score, tag, ent in entities:
    # strict '>' keeps the earliest mention when scores tie
    if ent not in ent2tagscores or score > ent2tagscores[ent]:
        ent2tag[ent] = tag
        ent2tagscores[ent] = score

# invert ent2tag map: tag -> list of the entity texts carrying that tag
tag2ent = {}
for ent_text, ent_tag in ent2tag.iteritems():
    if ent_tag not in tag2ent:
        tag2ent[ent_tag] = []
    tag2ent[ent_tag].append(ent_text)


# extract relations
# One detector file per relation type. The relation name is the last
# dot-separated piece of the file name once the ".svm" suffix is dropped.
rxfiledirs = glob.glob(mitiedir + "MITIE-models/english/binary_relations/rel_classifier_*.svm")
rxfiles = [path.split('/')[-1] for path in rxfiledirs]
rxdetails = [fname.split('.')[:-1] for fname in rxfiles]
rxnames = [parts[-1] for parts in rxdetails]

# run every detector over the entities and tag each hit with its relation name
relations = []
for extractor, extrname in zip(rxfiledirs, rxnames):
    for pair in relextract(extrname, extractor, ents):
        relations.append((extrname, pair[0], pair[1]))

# report the entities with their tags, then the relations, both sorted
print("==== NAMED ENTITIES =====")
for ent_text, ent_tag in sorted(ent2tag.iteritems()):
    print("%s %s" % (ent_text, ent_tag))

print("======= RELATIONS =======")
for relname, left, right in sorted(relations):
    # left entity, upper-cased relation name in brackets, right entity
    print("%s [ %s ] %s" % (left, relname.upper(), right))
    
