from node.models import Node, NodeType, Language, Ngram, Node_Ngram
from parsing.NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor

import collections
import dateutil.parser


class NgramCache:
    """
    This allows the fast retrieval of ngram ids
    from the cache instead of using the database for every call
    """
    
    def __init__(self, language):
        self._cache = dict()
        self._language = language
            
    def __getitem__(self, terms):
        terms = terms.strip().lower()
        if terms not in self._cache:
            try:
                ngram = Ngram.objects.get(terms=terms, language=self._language)
            except Ngram.DoesNotExist:
                ngram = Ngram(terms=terms, n=len(terms.split()), language=self._language)
                ngram.save()
            self._cache[terms] = ngram
        return self._cache[terms]
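
    # Illustrative usage (hypothetical `english` Language instance,
    # not part of this module):
    #
    #     cache = NgramCache(english)
    #     cache["climate change"]   # first call may hit the database
    #     cache["climate change"]   # now served from the in-memory cache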

        
class NgramCaches(collections.defaultdict):
    """One NgramCache per language, created lazily on first access."""

    def __missing__(self, language):
        self[language] = NgramCache(language)
        return self[language]
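
# Illustrative usage (hypothetical `english` Language instance):
#
#     caches = NgramCaches()
#     cache = caches[english]   # lazily creates the NgramCache for English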
        
    

"""Base class for performing files parsing depending on their type.
"""
class FileParser:
    
    def __init__(self, file=None, filepath="", encoding="utf8"):
        # get the file object, opening it from the path if none was given
        if file is None:
            self._file = open(filepath, "rb")
        else:
            self._file = file
        # keep the encoding around for subclasses that decode the contents
        self._encoding = encoding
        # caches for ngrams (one per language)
        self._ngramcaches = NgramCaches()
        # ngrams extractors (one per language)
        self._extractors = dict()
        self._document_nodetype = NodeType.objects.get(name='Document')
        languages = Language.objects.all()
        self._languages_fullname = {language.fullname.lower(): language for language in languages}
        self._languages_iso2 = {language.iso2.lower(): language for language in languages}
        self._languages_iso3 = {language.iso3.lower(): language for language in languages}
    
    """Extract the ngrams from a given text.
    """
    def extract_ngrams(self, text, language):
        # get the appropriate ngrams extractor for this language, if any
        if language not in self._extractors:
            extractor = None
            if language.iso2 == 'en':
                extractor = EnglishNgramsExtractor()
            elif language.iso2 == 'fr':
                extractor = FrenchNgramsExtractor()
            self._extractors[language] = extractor
        else:
            extractor = self._extractors[language]
        # extract the ngrams, counting the occurrences of each
        if extractor:
            tokens = []
            for ngram in extractor.extract_ngrams(text):
                ngram_text = ' '.join(token for token, tag in ngram)
                tokens.append(ngram_text)
            return collections.Counter(tokens)
        else:
            return dict()
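
    # Illustrative result (hypothetical call; `english` stands for an English
    # Language instance, and the actual counts depend on the extractor):
    #
    #     parser.extract_ngrams("The cat sat on the mat.", english)
    #     # -> Counter({'cat': 1, 'mat': 1})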
    
# TODO:
#   * make it possible to tag and to parse separately
#   * tag only some of the data (titles only; titles and abstracts; selected chapters...)

    """Add a document to the database.
    """
    def create_document(self, parentNode, title, contents, language, metadata, guid=None):
        metadata = self.format_metadata(metadata)
        # create or retrieve a resource for that document, based on its user id
#        if guid is None:
#            resource = Resource(guid=guid)
#        else:
#            try:
#                resource = Resource.get(guid=guid)
#            except:
#                resource = Resource(guid=guid)
#        # If the parent node already has a child with this resource, pass
#        # (is it a good thing?)
#        if parentNode.descendants().filter(resource=resource).exists():
#            return None
        # truncate the title so it fits in the database (the name column is
        # presumably capped at 200 characters)
        title = title[:200]
        # create the document itself
        childNode = Node(
            user        = parentNode.user,
            type        = self._document_nodetype,
            name        = title,
            language    = language,
            metadata    = metadata,
            #resource    = resource,
            parent      = parentNode
        )
        childNode.save()
            
        # parse it!
        ngrams = self.extract_ngrams(contents, language)
        # we are already in a transaction, so no use doing another one (or is there?)
        ngramcache = self._ngramcaches[language]
        for terms, occurrences in ngrams.items():
            ngram = ngramcache[terms]
            Node_Ngram(
                node       = childNode,
                ngram      = ngram,
                occurences = occurrences  # NB: the model field keeps this spelling
            ).save()
                
        # return the created document
        return childNode
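
    # Illustrative usage (hypothetical values: `corpus` is a parent Node,
    # `english` a Language instance):
    #
    #     self.create_document(
    #         parentNode = corpus,
    #         title      = "Some title",
    #         contents   = "Full text of the document...",
    #         language   = english,
    #         metadata   = {"publication_date": "2014-10-23 09:57:42"},
    #     )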
    
    """Useful method to detect the document encoding.
    Not sure it should be here actually.
    """
    def detect_encoding(self, string):
        # see the chardet library
        pass
    
    """Parse the data.
    This method shall be overriden by inherited classes.
    """
    def parse(self):
        return list()
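
    # Subclasses are expected to override parse(); a minimal, hypothetical
    # sketch (not an actual parser of this project):
    #
    #     class SomeFileParser(FileParser):
    #         def parse(self):
    #             documents = []
    #             # ...read self._file, decode it with self._encoding,
    #             # then build one entry per document found...
    #             return documents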


    def format_metadata_dates(self, metadata):
        """Normalize the dates found in the metadata.
        Example:
            {"publication_date": "2014-10-23 09:57:42"}
        becomes:
            {"publication_date":   "2014-10-23 09:57:42",
             "publication_year":   "2014",
             "publication_month":  "10",
             "publication_day":    "23",
             "publication_hour":   "09",
             "publication_minute": "57",
             "publication_second": "42"}
        """
        
        # first, look for dates split into separate fields
        # (e.g. "publication_year", "publication_month"...)
        prefixes = [key[:-5] for key in metadata.keys() if key.endswith("_year")]
        for prefix in prefixes:
            date_string = metadata[prefix + "_year"]
            # append each finer-grained component, stopping at the first one missing
            for suffix, separator in (("_month", " "), ("_day", " "), ("_hour", " "), ("_minute", ":"), ("_second", ":")):
                key = prefix + suffix
                if key not in metadata:
                    break
                date_string += separator + metadata[key]
            try:
                metadata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
            except (ValueError, OverflowError):
                pass
        
        # ...then split every "_date" field into its separate components
        prefixes = [key[:-5] for key in metadata.keys() if key.endswith("_date")]
        for prefix in prefixes:
            try:
                date = dateutil.parser.parse(metadata[prefix + "_date"])
            except (ValueError, OverflowError):
                continue
            metadata[prefix + "_year"]      = date.strftime("%Y")
            metadata[prefix + "_month"]     = date.strftime("%m")
            metadata[prefix + "_day"]       = date.strftime("%d")
            metadata[prefix + "_hour"]      = date.strftime("%H")
            metadata[prefix + "_minute"]    = date.strftime("%M")
            metadata[prefix + "_second"]    = date.strftime("%S")
                
        # finally, return the result!
        return metadata
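
    # Illustrative call (hypothetical field names):
    #
    #     self.format_metadata_dates({"publication_year":  "2014",
    #                                 "publication_month": "10",
    #                                 "publication_day":   "23"})
    #     # -> also sets "publication_date" to "2014-10-23 00:00:00",
    #     #    then fills in the remaining split fields from it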
    
    def format_metadata(self, metadata):
        """Format the metadata."""
        metadata = self.format_metadata_dates(metadata)
        return metadata