#!/usr/bin/env python

## write instance data for MSSC (mssc.mod)

#from __future__ import print_function
from __future__ import unicode_literals

import sys
import glob
import numpy as np
import math
import operator
import string
import re
import csv

from collections import defaultdict

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

######################### user definable #############################

featCard = 100
clustCard = 5
eps = 1e-6
nerfname = "jobs_tfidf.csv"

######################### FUNCTIONS ###############################

#######################################################################

## main

def read_documents(pattern):
    """Return the contents of every file matching *pattern*.

    The matching paths are sorted before reading so that the position of a
    document in the returned list (and therefore its 1-based index in the
    generated .dat file) is reproducible; glob.glob() alone returns paths
    in an arbitrary, filesystem-dependent order.

    pattern -- glob pattern, e.g. '*.txt'
    returns -- list with one string per file (empty if nothing matches)
    """
    docs = []
    for fn in sorted(glob.glob(pattern)):
        # 'with' closes the handle even if read() raises
        with open(fn, 'r') as fd:
            docs.append(fd.read())
    return docs


def write_mssc_data(path, k, num_features, X):
    """Write the MSSC instance data file.

    path         -- name of the .dat file to create (overwritten if present)
    k            -- number of clusters, written as 'param k'
    num_features -- value written as 'param d'
    X            -- sparse matrix (anything providing .shape and .tocoo());
                    its row count is written as 'param n' and its nonzero
                    entries as 1-based 'row col value' triplets of 'param p'
    """
    # COO form exposes (row, col, value) triplets directly, avoiding one
    # sparse-matrix lookup X[i,j] per entry.
    coo = X.tocoo()
    with open(path, "w") as outf:
        outf.write("# mssc.dat written by mssc_data.py\n")
        outf.write("param k := {0} ;\n".format(k))
        outf.write("param n := {0} ;\n".format(X.shape[0]))
        outf.write("param d := {0} ;\n".format(num_features))
        outf.write("param p :=\n")
        for i, j, value in zip(coo.row, coo.col, coo.data):
            # skip explicitly stored zeros: only nonzeros belong in the file
            if value != 0:
                outf.write("  {0} {1} {2}\n".format(str(i + 1), str(j + 1),
                                                  str(value)))
        outf.write(";\n")


def main(argv=None):
    """Build the TF-IDF matrix of ./*.txt and write it as an MSSC .dat file.

    argv -- argument vector (defaults to sys.argv); argv[1] is the number
            of clusters k, which must be an integer >= 2.

    Terminates through sys.exit() with a message when k is missing or
    invalid, or when no *.txt file is found in the current directory.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        sys.exit('need number of clusters k on cmd line')

    try:
        k = int(argv[1])
    except ValueError:
        sys.exit('k must be an integer')
    if k < 2:
        sys.exit('k must be at least 2')

    ## read all text files
    docs = read_documents('*.txt')
    if not docs:
        # fail early with a clear message instead of an error from the
        # vectorizer further down
        sys.exit('no *.txt files found in the current directory')

    # number of documents |C|
    m = len(docs)
    # number of features (words)
    n = featCard

    vectorizer = TfidfVectorizer(max_df=0.5, max_features=n,
                                 min_df=2, stop_words='english',
                                 use_idf=True)
    X = vectorizer.fit_transform(docs)

    datfn = "mssc-" + str(k) + "_" + str(m) + "_" + str(n) + ".dat"

    # NOTE(review): n is the max_features cap passed above; the fitted
    # matrix may have fewer columns. 'param d' is still written as n, as
    # before -- confirm mssc.mod is fine with unused feature indices.
    write_mssc_data(datfn, k, n, X)
    sys.stdout.write("output written to {0}\n".format(datfn))


if __name__ == "__main__":
    main()
