#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" 
ISI parser.
__author__ : alexandre+gargantext @ delanoe.org
__licence__ : GPL version 3.0+
__DATE__ : 2014
__VERSION__ : 1.0
"""

import os, sys
#reload(sys)
import re
import locale
# import hashlib ?

from datetime import datetime, date
from dateutil import parser

from documents.models import Document

#TODO:
# use separators in parameters

class Isi:
    """
    Thomson ISI parser.

    parse() reads a tagged bibliographic export (ISI or RIS layout, selected
    with ``bdd``) and stores one dict per notice in ``self.data``.
    add() then saves the collected notices as Document objects, skipping
    notices whose identifier has already been seen.
    """

    # Parameter file read by parse() for each supported value of ``bdd``.
    PARAMETER_FILES = {
        'isi': 'sources/parameters/isi.init',
        'ris': 'sources/parameters/ris.init',
    }

    def __init__(self):
        """
        Create a parser with no parsed notice and no known identifier.
        """
        # Notices (dicts) produced by parse() and consumed by add().
        self.data = []
        # Identifiers already stored; add() replaces this on each call.
        self.object_ids = set()

    def read_param(self, file):
        """
        Read a parameter file and return the tag table used by parse().

        Each useful line holds at least three tab-separated fields:
        ``name <TAB> tag <TAB> rule``. Lines starting with '#' and blank
        lines are ignored.

        Returns a dict mapping ``tag`` to the list ``[name, rule]``.
        Raises IndexError when a non-blank, non-comment line has fewer than
        three fields, and OSError when the file cannot be opened.
        """
        tags = {}
        # The context manager closes the file even when a line is malformed.
        with open(file, 'r') as source:
            for line in source:
                if line.startswith('#') or not line.strip():
                    continue
                # Drop only the line terminator, so that it does not end up
                # inside the last field; other whitespace is kept as is.
                fields = line.rstrip('\r\n').split('\t')
                tags[str(fields[1])] = [str(fields[0]), str(fields[2])]
        return tags

    def rules(self, parameters):
        """
        Interpret and apply the rules described for each field in the
        parameter file. Not implemented yet: this method does nothing.
        """
        pass

    def parse(self, source, bdd='isi'):
        """
        Parse ``source`` and append one dict per notice to ``self.data``.

        source -- object whose readlines() returns the raw lines of the
                  export, either as UTF-8 encoded bytes or as str.
        bdd    -- layout of the export: 'isi' (default) or 'ris'. It selects
                  the parameter file loaded through read_param().

        The first two characters of each line are its tag. The tag declared
        as BEGIN opens a notice, the tag declared as END closes it, any other
        declared tag starts a field, and a line starting with two spaces
        continues the current field.

        Raises ValueError for an unknown ``bdd`` or when the parameter file
        declares no BEGIN or no END tag. Errors raised while reading the
        parameter file are propagated to the caller.
        """
        if bdd not in self.PARAMETER_FILES:
            raise ValueError("unknown bdd %r (expected one of: %s)"
                             % (bdd, ', '.join(sorted(self.PARAMETER_FILES))))

        lines = source.readlines()

        print("reading parameters " + bdd.upper())
        parameters = self.read_param(self.PARAMETER_FILES[bdd])

        # The BEGIN and END entries delimit notices; they are not fields, so
        # they are taken out of the field table.
        begin = end = None
        for tag, spec in list(parameters.items()):
            if spec[0] == 'BEGIN':
                begin = tag
                del parameters[tag]
            elif spec[0] == 'END':
                end = tag
                del parameters[tag]
        if begin is None or end is None:
            raise ValueError("the parameter file for %r must declare both a "
                             "BEGIN and an END tag" % (bdd,))

        doc = {}
        key = ""        # tag of the field being accumulated ("" when none)
        result = ""     # text accumulated so far for that field

        for raw in lines:
            # Accept binary-mode sources (bytes) as well as text-mode ones.
            line = raw if isinstance(raw, str) else str(raw, encoding='UTF-8')

            if bdd == 'ris':
                # Remove the " - " that follows the tag. Only the first
                # occurrence is removed so that a " - " inside the field
                # value itself is preserved.
                line = line.replace(' - ', '', 1)

            tag = line[:2]

            if not doc and tag == begin:
                doc['url'] = " "
                key = ""
                result = ""

            elif tag in parameters:
                # A new field starts: store the one that was pending.
                # NOTE(review): when the same tag appears on consecutive
                # lines the pending value is replaced, not extended; confirm
                # this is intended for multi-valued fields.
                if key != "" and key != tag:
                    doc[parameters[key][0]] = result
                key = tag
                result = line[2:].strip()

            elif tag == '  ':
                result = result + ' ' + line[2:].strip()

            elif tag == end:
                if key != "":
                    doc[parameters[key][0]] = result
                # An END tag with nothing collected carries no notice.
                if doc:
                    self._set_date(doc)
                    self.data.append(doc)
                doc = {}
                key = ""
                result = ""

    @staticmethod
    def _set_date(doc):
        """
        Set doc['date'] from doc['year'] and, when present, doc['month'].

        "<year> <month>" is parsed with dateutil first; if that fails for any
        reason the year alone is used. When no date can be built, the problem
        is printed and ``doc`` is left without a 'date' key.
        """
        try:
            try:
                stamp = doc['year'] + " " + doc['month']
                doc['date'] = parser.parse(stamp)
            except Exception:
                doc['date'] = datetime.strptime(doc['year'], '%Y')
        except Exception as e:
            # Diagnostic output kept unchanged ('88' and '58' prefixes).
            print('88', e)
            try:
                print(doc['year'])
            except Exception as e:
                print('58', e)

    def add(self, project=None, corpus=None, user=None, ids=None):
        """
        Save the notices of ``self.data`` as Document objects, then empty
        ``self.data``.

        project, user -- values assigned to the matching Document attributes.
        corpus        -- when not None, added to each saved document's
                         ``corpus`` relation.
        ids           -- set of identifiers to treat as already stored; it is
                         updated in place. A new empty set is used when None.

        A notice is skipped when its 'date' is missing or is not a datetime,
        when it has neither a 'uniqu_id' nor a 'title' to build one from, or
        when its 'uniqu_id' is already known. Failures while filling or
        saving a document are printed and do not stop the loop.
        """
        if ids is not None:
            self.object_ids = ids
        else:
            self.object_ids = set()

        for notice in self.data:
            stamp = notice.get('date')
            if not isinstance(stamp, datetime):
                continue

            if 'uniqu_id' not in notice:
                if 'title' not in notice:
                    print("notice skipped: no 'uniqu_id' and no 'title'")
                    continue
                # The date is a datetime, so it must be converted before it
                # can be concatenated with the title.
                notice['uniqu_id'] = notice['title'] + str(stamp)

            if notice['uniqu_id'] in self.object_ids:
                continue
            self.object_ids.add(notice['uniqu_id'])

            doc = Document()

            for attribute, value in (('project', project), ('user', user)):
                try:
                    setattr(doc, attribute, value)
                except Exception as e:
                    print(e)

            # Best effort: a field missing from the notice is reported and
            # the remaining fields are still copied.
            for field in ('date', 'uniqu_id', 'title',
                          'source', 'authors', 'abstract'):
                try:
                    setattr(doc, field, notice[field])
                except Exception as e:
                    print(e)

            try:
                doc.save()
            except Exception as e:
                print(e)
                # The relation below cannot be filled on an unsaved document.
                continue

            if corpus is not None:
                doc.corpus.add(corpus)

        self.data = []


def demo():
    """
    Parse the export file named by the first command-line argument and print
    how many notices were found.

    Isi.add() is not called here: it saves Document objects and takes
    project, corpus and user values that the command line does not provide.
    """
    import sys
    if len(sys.argv) < 2:
        print("usage: %s <file to parse>" % sys.argv[0])
        return
    data = Isi()
    # parse() decodes the lines itself, so the file is opened in binary mode.
    with open(sys.argv[1], 'rb') as source:
        data.parse(source)
    print(len(data.data), "notices parsed")

if __name__ == "__main__":
    try:
        demo()
    except Exception as error:
        # sys.exc_traceback does not exist in Python 3; the traceback is
        # available on the exception itself. Print the line number where the
        # error surfaced in this module, followed by the error.
        print(error.__traceback__.tb_lineno, error)