#!/usr/bin/env python

## text mining - extract important words from a set of documents

#from __future__ import print_function
from __future__ import unicode_literals

import sys
import glob
import numpy as np
import math
import operator
import string
import re
import csv

from collections import defaultdict

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

# plots
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
import matplotlib.pyplot as plt
from matplotlib.path import Path
import matplotlib.patches as patches

######################### user definable #############################

# maximum number of vocabulary terms (features) the tf-idf vectorizer keeps
featCard = 100
# presumably a cluster count (KMeans is imported above); not referenced in
# the visible part of this script -- TODO confirm whether it is still needed
clustCard = 5
# a word is reported for a document only if its tf-idf weight exceeds this
eps = 1e-6
# name of the CSV file the per-document keywords are written to
nerfname = "jobs_tfidf.csv"

######################### FUNCTIONS ###############################

def invert_dict(d):
    """Invert a mapping: return value -> list of keys that map to it.

    Several keys may share one value, so every value of ``d`` is mapped to
    the list of all keys carrying it (in the iteration order of ``d``).

    Args:
        d: a dict whose values are hashable.

    Returns:
        A ``defaultdict(list)`` keyed by the values of ``d``.
    """
    d_inv = defaultdict(list)
    # items() exists on both Python 2 and Python 3 dicts, whereas
    # iteritems() is Python 2 only and raises AttributeError on Python 3.
    for k, v in d.items():
        d_inv[v].append(k)
    return d_inv

def asciionly(w):
    """Strip ``w`` down to the characters this script treats as word content.

    A character is kept when its code point lies in 48..127 (inclusive) and
    it is not listed in ``string.punctuation``. In practice that leaves the
    ASCII digits and letters (plus DEL, 0x7f); whitespace, punctuation and
    every non-ASCII character are dropped.
    """
    kept = [ch for ch in w
            if 48 <= ord(ch) <= 127 and ch not in string.punctuation]
    return ''.join(kept)

#######################################################################

## read all text files in the current directory
fileList = glob.glob('*.txt')
docs = list()
for fn in fileList:
    # the context manager closes the handle even if read() raises
    with open(fn, 'r') as fd:
        docs.append(fd.read())

# nothing to analyse: stop with a clear message instead of letting the
# vectorizer fail on an empty collection
if not docs:
    sys.exit("no *.txt files found in the current directory")

# number of documents |C|
m = len(docs)
# number of features (words)
n = featCard

## vector space model (textbook form):
##   tf(t,d)=|occurrences of word t in doc d|
##   idf(t,C)= |C| / |{d in C : t in d}|
##   tfidf(t,d,C) = tf(t,d)*idf(t,C)
##   each document d in C is transformed to the vector (tfidf(t,d,C) : t in d)
## NOTE: scikit-learn's TfidfVectorizer uses, by default, a smoothed
##   logarithmic idf and L2-normalises every document vector, so the
##   weights in X follow the idea above but not the exact formula.
## max_df=0.5 drops terms occurring in more than half of the documents,
## min_df=2 drops terms occurring in fewer than two documents.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=n,
                             min_df=2, stop_words='english',
                             use_idf=True)

## data set: |C|=m x n
##   informally: extract a set of n most meaningful words t from C,
##               then each document has weight of t as t-th component
X = vectorizer.fit_transform(docs)
## term -> column index of X
termindex = vectorizer.vocabulary_
## other interesting data
idf = vectorizer.idf_
words = vectorizer.get_feature_names()
vocab = dict(zip(words, idf))
# items() works on Python 2 and 3 alike (iteritems() is Python 2 only)
sortedvocab = sorted(vocab.items(), key=operator.itemgetter(1))
## vocabulary words ordered by increasing idf
sortedkeys = [str(s[0]) for s in sortedvocab]
#vocabinv = dict(invert_dict(vocab))

## write CSV: one row per document = index, file name, keywords found
#NERwords = {}
#NERdocname = {}
with open(nerfname, "w") as nerf:
    csvwr = csv.writer(nerf)
    # fileList and docs were built in lockstep, so zip pairs each document
    # with its file name; counter is also the row index of the doc in X
    for counter, (docname, d) in enumerate(zip(fileList, docs)):
        #NERwords[counter] = list()
        #NERdocname[counter] = docname
        csvrow = [counter, docname]
        for w1 in d.split():
            w = asciionly(w1.strip()).lower()
            # direct dict membership test; no key list is built per word
            if w in termindex:
                tiw = int(termindex[w])
                # keep the word only if it carries weight in this document
                if X[counter, tiw] > eps:
                    #NERwords[counter].append(w)
                    csvrow.append(w)
        csvwr.writerow(csvrow)
        # one "<index> : <word> <word> ..." line per document; a single
        # parenthesised argument prints the same on Python 2 and 3
        print(" ".join([str(counter), ":"] + csvrow[2:]))

## output keywords
print(" ".join(["all keywords in collection:".upper()] + sortedkeys))

print("END: output written to CSV file " + nerfname)

