#!/usr/bin/env python

## text mining - cluster a set of text documents (tf-idf features + k-means)
##               and plot the documents in 2D via PCA of their Gram matrix

#from __future__ import print_function

import sys
import glob
import numpy as np
import math

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

# plots
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
import matplotlib.pyplot as plt
from matplotlib.path import Path
import matplotlib.patches as patches

######################### user definable #############################

# upper bound on the number of features (words) kept by the tf-idf vectorizer
featCard = 100
# number of clusters requested from k-means
clustCard = 2
# file name the 2D cluster plot is saved to by plot2D
savefilename = "jobs_kmeans.png"

#######################################################################

######################### functions #############################

def PCA(B, K = 2):
    """Return principal coordinates of the points described by Gram matrix B.

    B is factored as V * diag(evals) * V^T with numpy.linalg.eigh.  The
    result keeps the eigenvectors belonging to the K largest eigenvalues
    (as columns, in decreasing eigenvalue order) and scales each column by
    the square root of its eigenvalue, so that x.dot(x.T) is the rank-K
    approximation of B and row i of x holds the coordinates of point i.

    Parameters:
        B: symmetric (n x n) array, expected to be positive semidefinite.
        K: number of dimensions to keep; values >= n return the full factor.

    Returns:
        Array of shape (n, min(K, n)).
    """
    (evals, evecs) = np.linalg.eigh(B)
    # round-off can push tiny eigenvalues of a PSD matrix slightly below zero
    evals[evals < 0] = 0
    n = B.shape[0]
    K = max(0, min(K, n))
    # eigh returns eigenvalues in ascending order with eigenvectors as
    # COLUMNS: reverse the columns to get decreasing order, then keep K
    top_evals = evals[::-1][:K]
    top_evecs = evecs[:, ::-1][:, :K]
    # broadcasting scales column j by sqrt(top_evals[j])
    return top_evecs * np.sqrt(top_evals)

def plot2D(x, clustering):
    """Scatter-plot labelled points in the plane, coloured by cluster.

    Parameters:
        x: 2D array indexed as x[i,0], x[i,1]; row i holds the coordinates
           of point i (only the first two columns are used).
        clustering: sequence of integer cluster labels, one per point;
           labels are expected to run from 0 to max(clustering).

    Each point is drawn as a large dot annotated with its index.  The figure
    is saved to the module-level `savefilename` and then shown.
    """
    fig = plt.figure()
    nclust = max(clustering)+1
    n = len(clustering)
    # two-dimensional plots on the xy plane
    ax = fig.add_subplot(111)
    # points and labels: one RGB colour per cluster, derived from its index
    for k in range(nclust):
        colour = (float(k) / nclust,
                  float((k + 1) % nclust) / nclust,
                  float((k + 2) % nclust) / nclust)
        for i in range(n):
            if clustering[i] == k:
                ax.plot(x[i,0], x[i,1], c=colour, marker='o', linestyle='None', ms=float(15))
                ax.annotate(str(i), xy=(x[i,0],x[i,1]), xytext=(-8,8), textcoords='offset points')
    # axes: pad each direction by 10% of ITS OWN data range
    xlo, xhi = min(x[:,0]), max(x[:,0])
    ylo, yhi = min(x[:,1]), max(x[:,1])
    epsx = 0.1*(xhi - xlo)
    epsy = 0.1*(yhi - ylo)
    ax.axis([xlo - epsx, xhi + epsx, ylo - epsy, yhi + epsy])
    plt.savefig(savefilename)
    plt.show()

#######################################################################

## read all text files in the current directory
# glob order depends on the filesystem; sorting keeps document IDs reproducible
fileList = sorted(glob.glob('*.txt'))
docs = list()
for fn in fileList:
    # 'with' closes the file even if reading fails
    with open(fn, 'r') as fd:
        docs.append(fd.read())
if not docs:
    # nothing to vectorize or cluster: stop with a clear message
    sys.exit("no '*.txt' files found in the current directory")

# number of documents |C|
m = len(docs)
# number of features (words)
n = featCard

## hand-made list of features
# features = dict()
# features['phd'] = 0
# features['optimization'] = 1
# features['mathematical'] = 2
# features['organization'] = 3
# features['marketing'] = 4
# n = len(features)

## vector space model (textbook form):
##   tf(t,d)      = number of occurrences of word t in document d
##   idf(t,C)     = |C| / |{d in C : t occurs in d}|
##   tfidf(t,d,C) = tf(t,d) * idf(t,C)
##   each document d in C becomes the vector (tfidf(t,d,C) : t a kept word)
## NOTE: TfidfVectorizer's actual idf weighting and row normalisation may
##       differ from these formulas in detail -- see the scikit-learn docs
vectorizer = TfidfVectorizer(
    stop_words='english',   # drop common English words
    min_df=2,               # document-frequency lower cut-off
    max_df=0.5,             # document-frequency upper cut-off
    max_features=n,         # keep at most n words
    use_idf=True,
    # vocabulary=features,  # enable to use the hand-made feature list
)
## data set: one row per document (m rows), one column per kept word
##   informally: pick up to n of the most meaningful words t in C,
##               then the t-th component of a document is its weight for t
X = vectorizer.fit_transform(docs)

#print X.toarray()
#print X.shape

## run k-means on X
km = KMeans(n_clusters=clustCard, init='k-means++', max_iter=100, n_init=1,
            verbose=False).fit(X)
## cluster label of every row of X (i.e. of every document)
clustering = km.labels_

## print out clusters (document IDs), one line per cluster
for k in range(clustCard):
    members = [" {0:d}".format(i)
               for i, label in enumerate(clustering) if label == k]
    # a single-argument print(...) is valid in both Python 2 and Python 3;
    # joining with " " reproduces the spacing of the old `print x,` chain
    print(" ".join(["cluster {0:d}:".format(k)] + members))

# order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# print order_centroids
# terms = vectorizer.get_feature_names()
# for i in range(clustCard):
#     print "cluster {0:d}:".format(i), 
#     for j in order_centroids[i]:
#         print " {0:s}".format(terms[j]),
#     print

## Gram matrix of the document metric: G[i,j] = <doc i, doc j>
# use the sparse matrix's own product; np.dot is not the supported way to
# multiply sparse matrices
G = X.dot(X.T).toarray()
Gdiag = np.diag(G)
Ones = np.ones(m)
## sqEDM of the document metric: D2[i,j] = G[i,i] + G[j,j] - 2*G[i,j]
# outer products build the m x m matrices diag(G) 1^T and 1 diag(G)^T
# (a dot product of the two 1-D vectors would collapse to a scalar)
D2 = np.outer(Gdiag, Ones) + np.outer(Ones, Gdiag) - 2*G
# round-off can leave tiny negative entries, which sqrt would turn into NaN
D2[D2 < 0] = 0
## EDM of the document metric
D = np.sqrt(D2)

## PCA in 2 dimensions
x = PCA(G)
#plot2D(x, [0]*m)
plot2D(x, clustering)


