#!/usr/bin/env python

## write instance data for MSSC (mssc.mod)

#from __future__ import print_function
from __future__ import unicode_literals

import sys
import glob
import numpy as np
from scipy import sparse
import math
import operator
import string
import re
import csv

from collections import defaultdict

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

######################### user definable #############################

# upper bound on the number of tf-idf features (words); passed to
# TfidfVectorizer as max_features by the main script
featCard = 100
# NOTE(review): not referenced in the visible part of this script
clustCard = 5
# threshold used by the main script when zeroing entries of the
# projected data matrix before it is written out
eps = 1e-6
# NOTE(review): not referenced in the visible part of this script
nerfname = "jobs_tfidf.csv"
# extra multiplicative factor applied to the random projection matrix
MagicFactor = 1.75 # see maths/dr2/code/jll/quantile.py
# value passed as p to achlioptas() when the projection matrix is sampled
achP = 1.0 / 6.0

######################### FUNCTIONS ###############################

### draw one sample from an Achlioptas distribution
def achlioptas_sample(p=1.0/6.0):
    """Return -1, 0 or 1 according to a single uniform draw.

    One variate u is drawn with np.random.uniform(0, 1).  The result is
    -1 when u < p, 1 when u > 1 - p, and 0 in every other case.  The
    test for -1 is made first, so it takes precedence if both hold.
    """
    u = np.random.uniform(0,1)
    if u < p:
        return -1
    if u > 1-p:
        return 1
    return 0

### generate an Achlioptas m x n random projection matrix
def achlioptas(m, n, p=1.0/6.0):
    """Return an m x n float array whose entries are -1, 0 or 1.

    All m*n uniform variates are drawn in a single call to
    np.random.uniform instead of one Python-level call per entry.
    An entry is -1 where its variate u satisfies u < p, 1 where
    u > 1 - p, and 0 otherwise; the -1 test takes precedence, which is
    the same rule achlioptas_sample applies to a single draw.

    Parameters:
        m -- number of rows
        n -- number of columns
        p -- threshold described above; p == 0 is replaced by 1/6

    Returns:
        numpy.ndarray of shape (m, n) with float entries.
    """
    if p == 0:
        p = 1.0/6.0
    u = np.random.uniform(0, 1, size=(m, n))
    # inner where handles the +1 case so that u < p wins when both hold
    return np.where(u < p, -1.0, np.where(u > 1 - p, 1.0, 0.0))


#######################################################################

## main

# Script body: read every *.txt file in the current directory, build a
# tf-idf matrix, project it with a scaled Achlioptas matrix and write the
# result as an AMPL-style .dat file for mssc.mod.
if len(sys.argv) < 2:
    sys.exit('need number of clusters k on cmd line')

k = int(sys.argv[1])
if k < 2:
    sys.exit('k must be at least 2')

## read all text files
fileList = glob.glob('*.txt')
docs = list()
for fn in fileList:
    # 'with' closes the file even if read() raises
    with open(fn, 'r') as fd:
        docs.append(fd.read())
if not docs:
    sys.exit('no *.txt files found in current directory')

# number of documents |C|
m = len(docs)

vectorizer = TfidfVectorizer(max_df=0.5, max_features=featCard,
                             min_df=2, stop_words='english',
                             use_idf=True)
# this is a docs x features matrix, i.e. m x n
X = vectorizer.fit_transform(docs)
# number of features (words): featCard is only an upper bound, so take the
# width the vectorizer actually produced to keep X.dot(A) well-defined
n = X.shape[1]
# random projections: arbitrarily decide K at n/2 (at least 1, since K is
# used as a divisor below)
K = max(1, int(round(0.5*float(n))))
# sample a scaled Achlioptas n x K matrix (right-multiplies X)
A = sparse.csr_matrix(MagicFactor * (1.0 / math.sqrt(K)) * achlioptas(n, K, achP))
Y = sparse.csr_matrix(X.dot(A))
# drop entries of negligible magnitude; the projection is signed, so the
# comparison is on the absolute value (a plain "< eps" would also erase
# every negative entry)
Y.data[np.abs(Y.data) < eps] = 0
Y.eliminate_zeros()

datfn = "mssc-rp-"+str(k)+"_"+str(m)+"_"+str(K)+".dat"

# COO view gives (row, col, value) triplets without per-element lookups
Ycoo = Y.tocoo()
with open(datfn, "w") as outf:
    outf.write("# mssc.dat written by mssc_data.py\n")
    outf.write("param k := {0} ;\n".format(k))
    outf.write("param n := {0} ;\n".format(m))
    outf.write("param d := {0} ;\n".format(K))
    outf.write("param p :=\n")
    # indices are written 1-based
    for i, j, v in zip(Ycoo.row, Ycoo.col, Ycoo.data):
        outf.write("  {0} {1} {2}\n".format(i+1, j+1, v))
    outf.write(";\n")
print("output written to " + datfn)
