#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
This module replicates the miislita vector spaces from
"A Linear Algebra Approach to the Vector Space Model -- A Fast Track Tutorial"
by Dr. E. Garcia, admin@miislita.com

See http://www.miislita.com for further details.

"""

from __future__ import division  # always use floats
from __future__ import with_statement

import logging
import os
import unittest

from gensim import utils, corpora, models, similarities
from gensim.test.utils import datapath, get_tmpfile

logger = logging.getLogger(__name__)


class CorpusMiislita(corpora.TextCorpus):
    stoplist = set('for a of the and to in on'.split())

    def get_texts(self):
        """
        Parse documents from the .cor file provided in the constructor. Lowercase
        each document and ignore some stopwords.

        .cor format: one document per line, words separated by whitespace.

        """
        for doc in self.getstream():
            yield [word for word in utils.to_unicode(doc).lower().split()
                    if word not in CorpusMiislita.stoplist]

    def __len__(self):
        """Define this so we can use `len(corpus)`"""
        if 'length' not in self.__dict__:
            logger.info("caching corpus size (calculating number of documents)")
            self.length = sum(1 for _ in self.get_texts())
        return self.length


class TestMiislita(unittest.TestCase):
    def test_textcorpus(self):
        """Make sure TextCorpus can be serialized to disk. """
        # construct corpus from file
        miislita = CorpusMiislita(datapath('head500.noblanks.cor.bz2'))

        # make sure serializing works
        ftmp = get_tmpfile('test_textcorpus.mm')
        corpora.MmCorpus.save_corpus(ftmp, miislita)
        self.assertTrue(os.path.exists(ftmp))

        # make sure deserializing gives the same result
        miislita2 = corpora.MmCorpus(ftmp)
        self.assertEqual(list(miislita), list(miislita2))

    def test_save_load_ability(self):
        """
        Make sure we can save and load (un/pickle) TextCorpus objects (as long
        as the underlying input isn't a file-like object; we cannot pickle those).
        """
        # construct corpus from file
        corpusname = datapath('miIslita.cor')
        miislita = CorpusMiislita(corpusname)

        # pickle to disk
        tmpf = get_tmpfile('tc_test.cpickle')
        miislita.save(tmpf)

        miislita2 = CorpusMiislita.load(tmpf)

        self.assertEqual(len(miislita), len(miislita2))
        self.assertEqual(miislita.dictionary.token2id, miislita2.dictionary.token2id)

    def test_miislita_high_level(self):
        # construct corpus from file
        miislita = CorpusMiislita(datapath('miIslita.cor'))

        # initialize tfidf transformation and similarity index
        tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
        index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))

        # compare to query
        query = 'latent semantic indexing'
        vec_bow = miislita.dictionary.doc2bow(query.lower().split())
        vec_tfidf = tfidf[vec_bow]

        # perform a similarity query against the corpus
        sims_tfidf = index[vec_tfidf]

        # for the expected results see the article
        expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
        for i, value in enumerate(expected):
            self.assertAlmostEqual(sims_tfidf[i], value, 2)


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    unittest.main()
