#!/usr/bin/env python

# NER + relation extraction in NLTK

import sys
import string
import re
import nltk
from nltk.tree import *
from nltk.draw import tree
from nltk.sem import relextract, extract_rels, rtuple
from nltk.stem.snowball import SnowballStemmer
#from nltk.tag.stanford import NERTagger

##############################################################
## user-definable

# the text file to analyse is the first command-line argument
if len(sys.argv) >= 2:
    textname = sys.argv[1]
else:
    # sys.exit() is always available, whereas the bare exit() helper is only
    # injected by the site module (absent under "python -S" / frozen builds);
    # a string argument is written to stderr and the exit status is 1
    sys.exit('need filename on cmd line')
    
##############################################################
## functions

# retrieve named entities into associative array ner
def ne_extract(t, ner):
    """Record every labelled subtree of t in the dict ner.

    t   -- a Tree (anything else is ignored)
    ner -- dict updated in place; key is the space-joined words of a
           subtree's leaves, value is that subtree's label

    Subtrees labelled 'S' are not recorded, but their children are
    still visited.  Returns None.
    """
    # plain leaves carry no label of their own
    if not isinstance(t, Tree):
        return
    label = t.label()
    if label != 'S':
        # leaves are (word, tag) pairs; only the words form the key
        words = [word for word, _tag in t.leaves()]
        ner[' '.join(words)] = label
    # walk the children so deeper subtrees are recorded as well
    for subtree in t:
        ne_extract(subtree, ner)

# remove POS tags
def unpostag(t):
    """Return the words of t with their part-of-speech tags removed.

    t may be:
      * a Tree -- every child is processed recursively and contributes
        ' ' + its text, so the result starts with a space when the tree
        has children;
      * a (word, tag) tuple leaf -- the word is returned as is;
      * a string of space-separated 'word/TAG' tokens -- the tag and the
        slash in front of it are dropped from each token and the result
        is stripped of surrounding whitespace.

    A token without any slash is returned unchanged.
    """
    if isinstance(t, Tree):
        return ''.join(' ' + unpostag(child) for child in t)
    if isinstance(t, tuple):
        # tagged trees in this file carry (word, tag) pairs as leaves
        # (see ne_extract); there is no 'word/TAG' string to split
        return t[0]
    # split on the LAST slash only: the tag is whatever follows it, so a
    # word that itself contains '/' (e.g. 'AC/DC/NNP') is kept whole
    words = [token.rsplit('/', 1)[0] for token in t.split(' ')]
    return ' '.join(words).strip()

##############################################################
## data structures

# named-entity lookups, both filled in later by the script:
#   namedents    -- entity text -> entity label
#   namedentsinv -- entity label -> list of entity texts
namedents, namedentsinv = {}, {}

# chunked parse trees, appended to one sentence at a time
ptrees = []


##############################################################
## read text
# read the whole input file; the with-block closes the handle as soon as
# the contents are in memory instead of leaving it open for the whole run
with open(textname, "r") as textf:
    text = textf.read()

##############################################################
## clean up text
# strip weird ASCII
# identity translation table: every byte maps to itself
nochangetable = string.maketrans('', '')
# bytes to drop: 0-8, 11, 14-30 and everything from 128 upwards
# NOTE(review): bytes 31 and 127 are left in place -- confirm that is intended
deletethese = ''.join((
    nochangetable[:9],
    nochangetable[11:12],
    nochangetable[14:31],
    nochangetable[128:],
))
text = text.translate(nochangetable, deletethese)
# newlines and form feeds become plain spaces
spacetable = string.maketrans('\n\x0c', '  ')
# replace multiple spaces with one
text = ' '.join(text.translate(spacetable).split())

##############################################################
## frequency analysis
# flat list of word tokens for the whole text, sentence by sentence
tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
# fetch the stopword list once and keep it as a set: the original looked it
# up again for every single token and scanned it linearly each time
stopword_set = set(nltk.corpus.stopwords.words('english'))
# tokens that survive stopword filtering; shared by both normalisers below
content_tokens = [t for t in tokens if t not in stopword_set]
stemmer = SnowballStemmer("english")
stemmed_tokens = [stemmer.stem(t) for t in content_tokens]
lemmatizer = nltk.WordNetLemmatizer()
lemmed_tokens = [lemmatizer.lemmatize(t) for t in content_tokens]
# frequency distributions over the stemmed and the lemmatised tokens
stdist = nltk.FreqDist(stemmed_tokens)
lmdist = nltk.FreqDist(lemmed_tokens)
# for i in sorted(stdist.items()):
#     print i
# print "==="
# for i in sorted(lmdist.items()):
#     print i
    

##############################################################
## syntactical/grammatical analysis
# break the cleaned text into a list of sentence strings
sentences = nltk.sent_tokenize(text)
# one list of word tokens per sentence
toksents = list(map(nltk.word_tokenize, sentences))
# one POS-tagged version of each tokenized sentence
postagsents = list(map(nltk.pos_tag, toksents))

##############################################################
## named entity recognition

# NLTK's native NER analyzer
# chunk every POS-tagged sentence and keep the resulting trees
chunked_sents = [nltk.ne_chunk(tagged) for tagged in postagsents]
ptrees.extend(chunked_sents)
# pair entities to names
for chunked in chunked_sents:
    ne_extract(chunked, namedents)

# # Stanford NER link
# st = NERTagger('/usr/local/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz', '/usr/local/stanford-ner/stanford-ner.jar', 'utf-8')
# for sentence in sentences:    
#     nersent = st.tag(sentence) 
#     ptrees.append(nersent)
#     ne_extract(nersent, namedents)

# invert the named entities dictionary
# start every label seen in namedents with its own fresh, empty list ...
namedentsinv.update((label, []) for label in namedents.values())
# ... then file each entity text under its label
for entity, label in namedents.items():
    namedentsinv[label].append(entity)
## print
#for name in namedentsinv:
#    print name, ':', namedentsinv[name]

##############################################################
## relation extraction with named entities
for pt in ptrees:
    pairs = relextract.tree2semi_rel(pt)
    reldicts = relextract.semi_rel2reldict(pairs)
    for r in reldicts:
        print unpostag(r['subjtext']), '|', r['untagged_filler'], '|', unpostag(r['objtext'])
     
# ## extracting the "IN" relationship
# # the "in" pattern
# IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
# for pt in ptrees:
#     for rel in relextract.extract_rels('ORG', 'LOC', pt, pattern = IN):
#         print(relextract.rtuple(rel))
