#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" 
Europresse Database parser for HTML sources only. 

This script is using 3 methods of parsing:

    1) REGEX (Regular Expressions) format detection
    2) SAX (Simple Api for Xml) like method for events detection
    3) DOM (Document Object Model), operating on the document as a whole for
    tree detection.

Bug reports? Please contact the author:
__author__ : alexandre+gargantext @ delanoe.org
__licence__ : GPL version 3.0+
__DATE__ : 09 november 2013
__VERSION__ : 2.0
"""

import os
import sys
import imp
imp.reload(sys)
import re

import locale

from datetime import datetime, date
from lxml import etree

from documents.models import Document
#from .corpus import Corpus

class Europresse():
    """
    1) First build tree to parse data
    2) Then each notice (article) is nested in a dictionary,
    3) Finaly, corpus is a list of articles as dictionnaries.
    """

    def __init__(self):
        """self.corpus is a list
        articles is the list of articles in the HTML page
        article is an article as dict"""

        # Specific declarations for Europresse
        self.data       = []

        # Encoding
        self.codif      = "UTF-8"
        self.localeEncoding = "fr_FR"

    def test_unicode(self, filename):
        import os
        os.system("file_europresse=$(mktemp -q); file --mime-encoding \'%s\' | grep -i -- \"iso-8859\" && \
                iconv -f latin1 -t utf8 \'%s\' > $file_europresse && \
                mv $file_europresse \'%s\'" % (filename, filename, filename))

    def parse(self, filename):
        """Adding filename to self.data after parsing"""
        count = 0
        articles   = []
        article    = {}

        parser = etree.HTMLParser(encoding=self.codif)
        tree = etree.parse(filename, parser)
        articles = tree.xpath('/html/body/table')

        for notice in articles:
            if len(notice):
                for name in notice.xpath("./tr/td/span[@class = 'DocPublicationName']"):
                    if name.text is not None:
                        format_journal = re.compile('(.*), (.*)', re.UNICODE)
                        test_journal = format_journal.match(name.text)
                        if test_journal is not None:
                            article['source'] = test_journal.group(1)
                            article['volume'] = test_journal.group(2)
                        else:
                            article['source'] = name.text.encode(self.codif)

                for header in notice.xpath("./tr/td/span[@class = 'DocHeader']"):
                    text = header.text
                    if isinstance(text, bytes):
                        text = text.decode()

                    format_date_fr = re.compile('\d+\s*\w+\s+\d{4}', re.UNICODE)
                    test_date_fr = format_date_fr.match(text)
                    
                    format_date_en = re.compile('\w+\s+\d+,\s+\d{4}', re.UNICODE)
                    test_date_en = format_date_en.match(text)

                    format_sect = re.compile('(\D+),', re.UNICODE)
                    test_sect = format_sect.match(text)
                    
                    format_page = re.compile(', p. (\w+)', re.UNICODE)
                    test_page = format_page.match(text)
                    
                    if test_date_fr is not None:
                        self.localeEncoding = "fr_FR"
                        locale.setlocale(locale.LC_ALL, self.localeEncoding)
                        try :
                            article['date'] = datetime.strptime(text, '%d %B %Y')
                        except :
                            try:
                                article['date'] = datetime.strptime(text, '%B %Y')
                            except :
                                pass
                    
                    if test_date_en is not None:
                        self.localeEncoding = "en_GB.UTF-8"
                        locale.setlocale(locale.LC_ALL, self.localeEncoding)
                        try :
                            article['date'] = datetime.strptime(text, '%B %d, %Y')
                        except :
                            try :
                                article['date'] = datetime.strptime(text, '%B %Y')
                            except :
                                pass

                    if test_sect is not None:
                        article['section'] = test_sect.group(1).encode(self.codif)
                    
                    if test_page is not None:
                        article['page'] = test_page.group(1).encode(self.codif)

                article['title'] = notice.xpath("string(./tr/td/span[@class = 'TitreArticleVisu'])").encode(self.codif)
                article['text'] = notice.xpath("./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()")
               
                line = 0
                br_tag = 10
                for i in articles[count].iter():
                   # print line, br, i, i.tag, i.attrib, i.tail
                    if i.tag == "span":
                        if "class" in i.attrib:
                            if i.attrib['class'] == 'TitreArticleVisu':
                                line = 1
                                br_tag = 2
                    if line == 1 and i.tag == "br":
                        br_tag -= 1
                    if line == 1 and br_tag == 0:
                        try:
                            article['authors'] = str.title(etree.tostring(i, method="text", encoding=self.codif)).encode(self.codif)#.split(';')
                        #article['authors'] = tuple(article['authors'])
                        except:
                            article['authors'] = 'not found'
                        line = 0
                        br_tag = 10
                
                
                try:
                    if article['date'] is not None or article['date'] != '':
                        try:
                            back = article['date']
                        except Exception as e: 
                            print(e)
                            pass
                    else:
                        try:
                            article['date'] = back
                        except Exception as e:
                            print(e)
                except :
                    article['date'] = datetime.now()


                article['uniqu_id'] = article['text'][-9]
                article['text'].pop()
                article['text'] = ' '.join(article['text'])
                article['text'] = re.sub('Tous droits réservés.*$', '', article['text'])

                article['bdd']  = 'europresse'
                article['url']  = ''
                
                self.data.append(article)
                article = {'source': "", 'volume': "", 'date': "", \
                        'authors': "", 'section': "", 'page':"", 'text': "", 'object_id':""}
                count += 1

    def add(self, project=None, corpus=None, user=None, ids=None):
        """ Appends notices to self.corpus from self.data removing duplicates"""
        if ids is not None:
            self.object_ids = ids
        else:
            self.object_ids = set()
        
        for i in self.data:
            if i['uniqu_id'] not in self.object_ids and isinstance(i['date'], datetime):
                self.object_ids.add(i['uniqu_id'])
                doc = Document()
                
                doc.project = project
                doc.user    = user

                doc.date    = i['date']
                doc.uniqu_id= i['uniqu_id']
                doc.title   = i['title']

                doc.source  = i['source']
                doc.authors = i['authors']
                doc.text    = i['text']

                doc.save()
                doc.corpus.add(corpus)

        self.data = []


def demo():
    import sys
    data = Europresse()
    try:
        pass
    except Exception as e:
        print("very usefull function", e)
    
        print(a['date'])


if __name__ == "__main__" :
    try:
        demo()
    except Exception as error:
        print(error)