#!/usr/bin/env python
# modularity clustering on document collection,
#   given a list of keywords about each document in collection
#
# 1. compute similarities using WordNet
# 2. cut weak similarities to "no edge"
# 3. perform modularity clustering

############### USER-CONFIGURABLE ################

# a solver value of a pair variable above this threshold is treated as 1
# when the clustering is read back from the solved model
eps = 1e-3
# a row sum of the thresholded similarity matrix below this value is
# treated as zero (document considered unconnected)
zeroeps = 1e-10
# default input CSV file; overridden by the first command-line argument
nerfname = "jobs_ner.csv"

##################################################

import string
import re
import glob
import sys

if sys.version_info < (3,):
    range = xrange
    
from nltk.corpus import wordnet
import numpy as np
from itertools import product
import math
from tabulate import tabulate
import csv

## optlang -- select solver
#from optlang import Variable, Constraint, Objective, Model
from optlang.glpk_interface import Model, Variable, Constraint, Objective
#from optlang.cplex_interface import Model, Variable, Constraint, Objective

# plots
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
import matplotlib.pyplot as plt
from matplotlib.path import Path
import matplotlib.patches as patches

# NetworkX
import networkx as nx

############### FUNCTIONS ################

# relation extractor function (wordnet-related)
def relextract(extrname, extractor, entities):
    """Return the texts of adjacent entity pairs with a positive relation score.

    NOTE(review): this function relies on the names
    ``binary_relation_detector``, ``ner`` and ``tokens``; none of them is
    defined or imported in the visible part of this file, so calling it
    as-is raises NameError.  Nothing in the visible script calls it.

    extrname  -- unused; kept so the signature stays unchanged
    extractor -- handed to ``binary_relation_detector`` to build the scorer
    entities  -- indexable sequence; item ``[0]`` of each entry is iterated
                 as indices into ``tokens``

    Returns a list of ``(entity1_text, entity2_text)`` tuples.
    """
    rel_detector = binary_relation_detector(extractor)
    # consecutive entities, first in reading order and then swapped, so both
    # orderings of every adjacent pair get scored
    # (``range`` rather than ``xrange``: the module aliases ``range`` to
    # ``xrange`` on Python 2, and ``xrange`` does not exist on Python 3)
    neighboring_entities = [(entities[i][0], entities[i+1][0])
                            for i in range(len(entities) - 1)]
    neighboring_entities += [(r, l) for (l, r) in neighboring_entities]
    relations = []
    for ent1, ent2 in neighboring_entities:
        rel = ner.extract_binary_relation(tokens, ent1, ent2)
        if rel_detector(rel) > 0:
            ent1_text = " ".join(tokens[i] for i in ent1)
            ent2_text = " ".join(tokens[i] for i in ent2)
            relations.append((ent1_text, ent2_text))
    return relations

# extension to sets of words (of different size) of Wu-Palmer similarity
def WUPSimilarity(list1, list2):
    """Average Wu-Palmer similarity between the synsets of two word lists.

    Every word is expanded to all of its WordNet synsets; the result is the
    mean of ``wordnet.wup_similarity`` over all (synset1, synset2) pairs.

    list1, list2 -- iterables of words (strings)

    Returns a float.  A pair for which ``wup_similarity`` yields None
    contributes 0, and 0.0 is returned when either list has no synset at
    all (instead of dividing by zero).
    """
    syn1 = set(ss for word in list1 for ss in wordnet.synsets(word))
    syn2 = set(ss for word in list2 for ss in wordnet.synsets(word))
    if not syn1 or not syn2:
        return 0.0
    total = sum(wordnet.wup_similarity(s1, s2) or 0.0
                for s1, s2 in product(syn1, syn2))
    return float(total) / (len(syn1) * len(syn2))

# extension to sets of words from two documents (u,v in d1 and d2 -> wup(uv)=1)
def WUPSimilarityDoc(list1, list2):
    """Average Wu-Palmer similarity between the keyword sets of two documents.

    Same averaging as the plain word-list version, except that a pair whose
    ``wordnet.wup_similarity`` is falsy (None or 0) is counted as 1.

    list1, list2 -- iterables of keywords (strings)

    Returns a float.  When either keyword list has no WordNet synset at all
    (e.g. it only holds names unknown to WordNet) there is nothing to
    average, and 0.0 is returned instead of raising ZeroDivisionError.
    """
    syn1 = set(ss for word in list1 for ss in wordnet.synsets(word))
    syn2 = set(ss for word in list2 for ss in wordnet.synsets(word))
    if not syn1 or not syn2:
        return 0.0
    total = sum(wordnet.wup_similarity(s1, s2) or 1
                for s1, s2 in product(syn1, syn2))
    return float(total) / (len(syn1) * len(syn2))

# principal component analysis
def PCA(B, K = 2):
    """Embed the rows of a symmetric (Gram-like) matrix in at most K dimensions.

    B -- symmetric n x n numpy array
    K -- number of principal components wanted (default 2)

    Returns an n x min(K, n) array ``x`` whose column j is the eigenvector
    of the j-th largest eigenvalue scaled by the square root of that
    eigenvalue, so that ``x.dot(x.T)`` is the best rank-K approximation of
    the positive part of B.  Row i holds the coordinates of point i.
    Negative eigenvalues are clamped to zero before taking square roots.
    """
    (evals, evecs) = np.linalg.eigh(B)
    evals[evals < 0] = 0
    n = B.shape[0]
    # eigh returns eigenvalues in ascending order with the eigenvectors as
    # COLUMNS of evecs: pick the last min(K, n) columns, largest first
    keep = np.arange(n)[::-1][:min(K, n)]
    # broadcasting scales column j by sqrt(eigenvalue j)
    return evecs[:, keep] * np.sqrt(evals[keep])

# plot 2-dimensional vector sets
def plot2D(x, clustering = None):
    """Scatter-plot 2-dimensional points, coloured by cluster, and show the figure.

    x          -- numpy array with one point per row; columns 0 and 1 are
                  used as the plane coordinates
    clustering -- optional sequence giving the integer cluster index of each
                  row of x; when missing or empty all points go to cluster 0

    Each point is annotated with its row index.  Blocks in ``plt.show()``.
    """
    n = x.shape[0]
    if clustering is None or len(clustering) == 0:
        clustering = [0]*n
    fig = plt.figure()
    nclust = max(clustering)+1
    # two-dimensional plots on the xy plane
    ax = fig.add_subplot(111)
    # points and labels, one colour per cluster index
    for k in range(nclust):
        rr = float(k) / nclust
        gg = float(((k + 1) % nclust)) / nclust
        bb = float(((k + 2) % nclust)) / nclust
        for i in range(n):
            if clustering[i] == k:
                ax.plot(x[i,0], x[i,1], c=(rr,gg,bb), marker='o', linestyle='None', ms=float(9))
                ax.annotate(str(i), xy=(x[i,0],x[i,1]), xytext=(-6,6), textcoords='offset points')
    # axes: leave a 10% margin around the data on each axis
    xlo, xhi = min(x[:,0]), max(x[:,0])
    ylo, yhi = min(x[:,1]), max(x[:,1])
    epsx = 0.1*(xhi - xlo)
    epsy = 0.1*(yhi - ylo)
    # the vertical margin must come from the y range (epsy), not epsx
    ax.axis([xlo - epsx, xhi + epsx, ylo - epsy, yhi + epsy])
    plt.show()

########################################

## MAIN ##

if len(sys.argv) >= 2:
    nerfname = sys.argv[1]

## input data from CSV file
# each row: <integer document index>, <document name>, <keyword>, ...
print("reading from CSV... %s" % nerfname)
# the csv module wants a binary file on Python 2 and a text file opened
# with newline='' on Python 3
if sys.version_info < (3,):
    nerf = open(nerfname, 'rb')
else:
    nerf = open(nerfname, 'r', newline='')
NERdocname = {}
NERwords = {}
# the with-block closes the file even when a malformed row aborts the run
with nerf:
    for row in csv.reader(nerf):
        if len(row) < 3:
            sys.exit('CSV has wrong format (expecting at least 3 cols)')
        docindex = int(row[0])
        NERdocname[docindex] = str(row[1])
        NERwords[docindex] = [str(w) for w in row[2:]]

## find similarities between documents
print("computing the document similarity matrix using Wu-Palmer...")
m = len(NERwords)
# start from all ones: the diagonal (self-similarity) is never overwritten
G = np.ones((m, m))
# fill the strict upper triangle and mirror each value below the diagonal
for u in range(m):
    for v in range(u + 1, m):
        sim = WUPSimilarityDoc(set(NERwords[u]), set(NERwords[v]))
        G[u, v] = sim
        G[v, u] = sim
            
## create modularity graph to cluster

# ## obtain distances from similarities
# ## (we should do PCA or reflection on Gdiag if there are negative eigenvalues)
# Gdiag = np.diag(G)
# Ones = np.ones(m)
# D2 = (Gdiag.T.dot(Ones) + Ones.T.dot(Gdiag) - 2*G)
# D = np.sqrt(D2)

# extract edges with highest weight: (u,v) is an edge iff G[u,v] >= median(G)
Gmedian = np.median(G)
E = [(u,v) for u in range(m) for v in range(m) if G[u,v] >= Gmedian]
# set for O(1) membership tests; E itself stays an ordered list
edge_set = set(E)
GZ = G.copy()
for u in range(m):
    for v in range(m):
        if (u,v) not in edge_set:
            GZ[u,v] = 0
# repair unconnectedness (arbitrarily link to doc of closest index)
# The diagonal is left out of the test: G[u,u] is 1, which passes the median
# cut whenever similarities do not exceed 1, so a row sum that included it
# could never drop below zeroeps and the repair would never trigger.
for u in range(m):
    if m > 1 and sum(GZ[u,v] for v in range(m) if v != u) < zeroeps:
        v = (u + 1) % m
        GZ[u,v] = Gmedian
        GZ[v,u] = Gmedian
        for e in ((u,v), (v,u)):
            # never list the same edge twice
            if e not in edge_set:
                E.append(e)
                edge_set.add(e)
        
## print out the similarity matrices in latex form
#print(tabulate(G, tablefmt="latex", floatfmt=".2f"))
#print(tabulate(GZ, tablefmt="latex", floatfmt=".2f"))

# print instance out in AMPL .dat format (for modularity.sh script)
# vertex indices are written 1-based; the with-block closes the file even if
# a write fails
with open("jobs.dat", "w") as datf:
    datf.write("param n := %s ;\n" % m)
    datf.write("param a :=\n")
    for (i,j) in E:
        datf.write("%s %s %s\n" % (i+1, j+1, GZ[i,j]))
    datf.write(";\n")

print("formulating and solving modularity clustering instance...")

## modularity clustering model in Python's optlang
## parameters

# set for O(1) membership tests (E is a list)
edge_set = set(E)

# weighted degree sequences, taken from GZ (the thresholded/repaired matrix)
# so that they agree with the edge weights used in the objective below
wdegseq = [sum(GZ[i,j] for j in range(m) if (i,j) in edge_set)
           for i in range(m)]
# half the total weighted degree, i.e. the total edge weight
m2 = sum(wdegseq) / 2.0
# constant in the obj
C = sum(w*w for w in wdegseq) / (4*m2*m2)

# obj coefficients: edge weight (if any) minus the expected weight
# wdeg_i * wdeg_j / (2 * m2)
coeff = dict()
for i in range(m):
    coeff[i] = dict()
    for j in range(m):
        coeff[i][j] = - wdegseq[i] * wdegseq[j] / (2*m2)
        if (i,j) in edge_set:
            coeff[i][j] = coeff[i][j] + GZ[i,j]
        
## variables: one binary x_i_j per ordered pair of documents
var = {i: {j: Variable(name = "x_{}_{}".format(i,j), type="binary")
           for j in range(m)}
       for i in range(m)}

# index sets reused below, both in increasing lexicographic order
pairs = [(i, j) for i in range(m) for j in range(i + 1, m)]
triples = [(i, j, h) for (i, j) in pairs for h in range(j + 1, m)]

## objective (the constant C is added later, GLPK allows no constant)
obj = Objective(
    (1/float(m2)) *
    sum(coeff[i][j] * var[i][j] for (i, j) in pairs),
    direction = "max"
)

## constraints
constr = []
# three families over every i<j<h: any two of the three pair variables
# being 1 forces the third one to be 1 as well
constr.extend(
    Constraint(var[i][j] + var[j][h] - var[i][h], ub = 1.0,
               name = "clique1_{}_{}_{}".format(i,j,h))
    for (i, j, h) in triples)
constr.extend(
    Constraint(var[i][j] - var[j][h] + var[i][h], ub = 1.0,
               name = "clique2_{}_{}_{}".format(i,j,h))
    for (i, j, h) in triples)
constr.extend(
    Constraint(-var[i][j] + var[j][h] + var[i][h], ub = 1.0,
               name = "clique3_{}_{}_{}".format(i,j,h))
    for (i, j, h) in triples)
# symmetry: x_i_j == x_j_i
constr.extend(
    Constraint(var[i][j] - var[j][i], lb = 0.0, ub = 0.0,
               name = "symm_{}_{}".format(i,j))
    for (i, j) in pairs)
# reflexivity: x_i_i == 1
constr.extend(
    Constraint(var[i][i], lb = 1.0, ub = 1.0, name = "refl_{}".format(i))
    for i in range(m))

# ## print out model for validation purposes
# print obj
# for cnst in constr:
#     print cnst
    
## model and solve
model = Model()
model.configuration.verbosity = 3
model.objective = obj
model.add(constr)
status = model.optimize()
print("Status: %s" % status)
print("Obj fun value = %s" % (model.objective.value - C))

## read the clustering back from the pair variables
# Documents i and j belong together whenever x_i_j is (numerically) 1.
# The pairs are merged with a union-find structure so that the result does
# not depend on the order in which the solver lists its variables and the
# cluster ids come out consecutive (no empty cluster ids).
parent = list(range(m))

def _find(i):
    # representative of i's group, with path halving
    while parent[i] != i:
        parent[i] = parent[parent[i]]
        i = parent[i]
    return i

for vr in model.variables:
    if vr.primal > eps:
        # variable names have the form x_<i>_<j>
        vn = vr.name.split('_')
        ri = _find(int(vn[1]))
        rj = _find(int(vn[2]))
        if ri != rj:
            # the smaller index stays the representative of the merged group
            parent[max(ri, rj)] = min(ri, rj)

# number the groups 0,1,2,... in order of their smallest document index
clustering = [-1]*m
clindex = 0
for i in range(m):
    r = _find(i)
    if clustering[r] < 0:
        clustering[r] = clindex
        clindex = clindex + 1
    clustering[i] = clustering[r]

# output: one line per cluster index, listing the names of its documents
clusters = max(clustering) + 1
for clindex in range(clusters):
    members = [str(NERdocname[i])
               for i, c in enumerate(clustering) if c == clindex]
    print(" ".join(["cluster", str(clindex), ":"] + members))

## draw the graph and the clustering
Gph = nx.Graph()
# add every document up front: documents whose only edge is the self-loop
# still get drawn, and the node order 0..m-1 matches the order of `colors`
Gph.add_nodes_from(range(m))
for (i, j) in E:
    if i < j:
        # GZ carries the weights the clustering model was built from
        Gph.add_edge(i, j, weight=float(GZ[i,j]))
labels = {i: NERdocname[i] for i in range(m)}

# one colour value per document, derived from its cluster index
colors = [round((float(c)/float(clusters)+0.5)/2.0, 2) for c in clustering]
pos = nx.spring_layout(Gph)

print("color values = %s" % colors)

nx.draw(Gph, pos, node_color=colors, edge_color='gray', node_size=500, with_labels=True)
# shift the document-name labels away from the node markers; the shift is
# half the position of the first node, as before
# (dict views cannot be indexed on Python 3, hence next(iter(...)))
offset = 0.5 * np.array(next(iter(pos.values())))
# label positions must be a dict keyed by node, like `pos`
labelpos = {node: np.array(p) + offset for node, p in pos.items()}
nx.draw_networkx_labels(Gph, labelpos, labels, font_size=10)
plt.axis('off')
plt.savefig("jobs_modularity.png", format="PNG")
plt.show()

## draw the PCA embedding in the plane
#x = PCA(G)
#plot2D(x, clustering)


########################## OBLIVION ##########################

