adding europresse import at saving point

56fca2a2 · Administrator · fb33e646 · 56fca2a2 · 56fca2a2 · 56fca2a2
Commit 56fca2a2 authored Sep 29, 2014 by Administrator
8 changed files
--- a/documents/admin.py
+++ b/documents/admin.py
@@ -5,6 +5,9 @@ from nested_inlines.admin import NestedModelAdmin, NestedStackedInline, NestedTa

 from documents.models import Source, Language, Project, Corpus, Document, Ngram, NgramDocument, List, ListNgram

+from sources.europresse import Europresse
+from gargantext_web.settings import MEDIA_ROOT
+
 class DocumentInLine(admin.StackedInline):
    model = Document
    extra = 0
@@ -61,8 +64,24 @@ class CorpusAdmin(admin.ModelAdmin):
        if not change:
            obj.user = request.user
        obj.save()
-        for i in range(1,100000):
-            print("GOOOOOOOOOOOOOO")
+        print(obj.database, obj.language, obj.zip_file)
+        
+        try:
+# importer(
+            import zipfile
+            c = Europresse()
+            if zipfile.is_zipfile(obj.zip_file):
+                with zipfile.ZipFile(obj.zip_file, 'r') as z:
+                    for f in z.namelist():
+                        i = z.open(f, 'r')
+                        for l in i.readline():
+                            print(l)
+                        #c.importer(MEDIA_ROOT + "/" + str(f))
+
+#                    for article in c:
+#                        print(article['title'])
+        except Exception as e:
+            print(e)

 class DocumentAdmin(admin.ModelAdmin):
    exclude = ('user',)

--- a/gargantext_web_tutorial.ipynb
+++ b/gargantext_web_tutorial.ipynb
 {
 "metadata": {
  "name": "",
-  "signature": "sha256:9a933c547abe9d85a6fbe1af7d7d7371d37042e6ad63ac70b7e46acedf7f294f"
+  "signature": "sha256:c112732dc666c365db7529d3971cad98ecc2bd84cdea3dc8613b609abf19e262"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
@@ -103,6 +103,26 @@
     "outputs": [],
     "prompt_number": 1
    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "zipfile.is_zipfile(\"/tmp/date.zip\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 2,
+       "text": [
+        "False"
+       ]
+      }
+     ],
+     "prompt_number": 2
+    },
    {
     "cell_type": "code",
     "collapsed": false,
@@ -115,20 +135,24 @@
     "metadata": {},
     "outputs": [
      {
-       "output_type": "stream",
-       "stream": "stdout",
-       "text": [
-        "date.txt\n"
+       "ename": "FileNotFoundError",
+       "evalue": "[Errno 2] No such file or directory: '/tmp/date.zip'",
+       "output_type": "pyerr",
+       "traceback": [
+        "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+        "\u001b[1;32m<ipython-input-3-62bd2ffbe177>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mwith\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mZipFile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"/tmp/date.zip\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'r'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m     \u001b[1;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnamelist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m         \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+        "\u001b[1;32m/usr/lib/python3.4/zipfile.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, file, mode, compression, allowZip64)\u001b[0m\n\u001b[0;32m    921\u001b[0m             \u001b[0mmodeDict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;34m'r'\u001b[0m \u001b[1;33m:\u001b[0m \u001b[1;34m'rb'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'w'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;34m'wb'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'a'\u001b[0m \u001b[1;33m:\u001b[0m \u001b[1;34m'r+b'\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    922\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 923\u001b[1;33m                 \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodeDict\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mmode\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    924\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    925\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0mmode\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'a'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+        "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/tmp/date.zip'"
       ]
      }
     ],
-     "prompt_number": 12
+     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
-      "# Imporation\n",
+      "# Importation\n",
      "## Europresse"
     ]
    },

--- a/sources/__init__.py
+++ b/sources/__init__.py
--- a/sources/admin.py
+++ b/sources/admin.py
+from django.contrib import admin
+
+# Register your models here.
--- a/sources/europresse.py
+++ b/sources/europresse.py
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+""" 
+Europresse Database parser for HTML sources only. 
+
+This script uses three parsing methods:
+
+    1) REGEX (regular expressions) for format detection
+    2) a SAX-like (Simple API for XML) method for event detection
+    3) DOM (Document Object Model), operating on the document as a whole,
+    for tree detection.
+
+Bug reports? Please contact the author:
+__author__ : alexandre+gargantext @ delanoe.org
+__licence__ : GPL version 3.0+
+__DATE__ : 09 november 2013
+__VERSION__ : 2.0
+"""
+
+import os
+import sys
+import imp
+imp.reload(sys)
+sys.path.append("../../gargantext/")
+import re
+
+import locale
+
+from datetime import datetime, date
+from lxml import etree
+
+from documents.models import Document
+#from .corpus import Corpus
+
+class Europresse(Document):
+    """Parser for Europresse HTML exports.
+
+    1) ``parse`` builds an lxml tree of the file and turns every non-empty
+       ``/html/body/table`` element (one notice, i.e. one article) into a
+       dictionary appended to ``self.data``.
+    2) ``ajouter`` moves the dictionaries from ``self.data`` into
+       ``self.articles``, dropping those whose ``object_id`` was already
+       seen or whose ``date`` is not a ``datetime``.
+
+    NOTE(review): the collected articles are kept in ``self.articles``
+    rather than ``self.corpus`` because ``Document`` is defined elsewhere
+    and may already declare a ``corpus`` attribute -- confirm against
+    documents.models.
+    """
+
+    def __init__(self):
+        """Create an empty parser.
+
+        self.data holds the articles of the files parsed since the last
+        call to ``ajouter``; self.articles holds the de-duplicated articles;
+        self.object_ids holds the identifiers already stored.
+        """
+        # NOTE(review): the original author doubted this call is useful.
+        super().__init__()
+
+        # Specific declarations for Europresse
+        self.data = []
+        self.articles = []
+        self.object_ids = set()
+
+        # Encoding
+        self.codif = "UTF-8"
+        self.localeEncoding = "fr_FR"
+
+    def test_unicode(self, filename):
+        """Rewrite *filename* as UTF-8 when ``file`` reports it as ISO-8859.
+
+        The detection is delegated to the ``file`` command, called with an
+        argument list (no shell) so that the file name is never interpreted
+        by a shell.  The conversion itself is done in Python and the result
+        replaces the original file.  Nothing happens when the detection
+        fails or reports another encoding.
+        """
+        import os
+        import subprocess
+        import tempfile
+
+        try:
+            detected = subprocess.check_output(
+                ["file", "--brief", "--mime-encoding", "--", filename])
+        except (OSError, subprocess.CalledProcessError) as error:
+            print("Could not detect the encoding of", filename, error)
+            return
+        if b"iso-8859" not in detected.lower():
+            return
+
+        with open(filename, "rb") as source:
+            content = source.read().decode("latin1")
+        # Write next to the target so that os.replace stays on one filesystem.
+        directory = os.path.dirname(os.path.abspath(filename))
+        with tempfile.NamedTemporaryFile(dir=directory, delete=False) as target:
+            target.write(content.encode("utf8"))
+        os.replace(target.name, filename)
+
+    def _parse_date(self, text, formats):
+        """Return *text* parsed with the first fitting format, else None.
+
+        Side effect: switches the process-wide locale (LC_ALL) to
+        ``self.localeEncoding`` so that ``%B`` matches localized month names.
+        ``locale.Error`` is not caught here and propagates to the caller.
+        """
+        locale.setlocale(locale.LC_ALL, self.localeEncoding)
+        for date_format in formats:
+            try:
+                return datetime.strptime(text, date_format)
+            except ValueError:
+                continue
+        return None
+
+    def parse(self, filename):
+        """Parse *filename* and append one dictionary per notice to self.data.
+
+        Every article dictionary has the keys source, volume, date, authors,
+        section, page, title, text, object_id, bdd and url.  Text values are
+        stored as ``str``.  A notice without a parsable date inherits the
+        date of the previous notice of the same file, or the current time
+        when there is none.
+        """
+        # Compiled once per call instead of once per header.
+        format_journal = re.compile(r'(.*), (.*)', re.UNICODE)
+        format_date_fr = re.compile(r'\d+\s*\w+\s+\d{4}', re.UNICODE)
+        format_date_en = re.compile(r'\w+\s+\d+,\s+\d{4}', re.UNICODE)
+        format_sect = re.compile(r'(\D+),', re.UNICODE)
+        format_page = re.compile(r', p. (\w+)', re.UNICODE)
+
+        parser = etree.HTMLParser(encoding=self.codif)
+        tree = etree.parse(filename, parser)
+        last_date = None
+
+        for notice in tree.xpath('/html/body/table'):
+            if not len(notice):
+                continue
+
+            article = {'source': "", 'volume': "", 'date': "",
+                       'authors': "", 'section': "", 'page': "", 'text': "",
+                       'object_id': ""}
+
+            text_nodes = notice.xpath(
+                "./tr/td/descendant-or-self::*"
+                "[not(self::span[@class='DocHeader'])]/text()")
+            # The identifier is read as the ninth text node from the end;
+            # a table with fewer text nodes is skipped instead of aborting
+            # the whole file with an IndexError.
+            if len(text_nodes) < 9:
+                print("Skipping a notice without identifier in", filename)
+                continue
+
+            for name in notice.xpath("./tr/td/span[@class = 'DocPublicationName']"):
+                if name.text is None:
+                    continue
+                journal = format_journal.match(name.text)
+                if journal is not None:
+                    article['source'] = journal.group(1)
+                    article['volume'] = journal.group(2)
+                else:
+                    article['source'] = str(name.text)
+
+            for header in notice.xpath("./tr/td/span[@class = 'DocHeader']"):
+                text = header.text
+                if text is None:
+                    continue
+                if isinstance(text, bytes):
+                    text = text.decode()
+
+                if format_date_fr.match(text) is not None:
+                    self.localeEncoding = "fr_FR"
+                    parsed = self._parse_date(text, ('%d %B %Y', '%B %Y'))
+                    if parsed is not None:
+                        article['date'] = parsed
+
+                if format_date_en.match(text) is not None:
+                    self.localeEncoding = "en_GB.UTF-8"
+                    parsed = self._parse_date(text, ('%B %d, %Y', '%B %Y'))
+                    if parsed is not None:
+                        article['date'] = parsed
+
+                section = format_sect.match(text)
+                if section is not None:
+                    article['section'] = section.group(1)
+
+                page = format_page.match(text)
+                if page is not None:
+                    article['page'] = page.group(1)
+
+            article['title'] = str(notice.xpath(
+                "string(./tr/td/span[@class = 'TitreArticleVisu'])"))
+
+            # The authors are read from the second <br> that follows the
+            # title span.  The scan walks the current notice itself, so it
+            # cannot drift when empty tables were skipped.
+            awaiting_authors = False
+            breaks_left = 0
+            for element in notice.iter():
+                if (element.tag == "span"
+                        and element.attrib.get('class') == 'TitreArticleVisu'):
+                    awaiting_authors = True
+                    breaks_left = 2
+                if awaiting_authors and element.tag == "br":
+                    breaks_left -= 1
+                if awaiting_authors and breaks_left == 0:
+                    try:
+                        # encoding="unicode" returns str, so .title() works.
+                        article['authors'] = etree.tostring(
+                            element, method="text", encoding="unicode").title()
+                    except Exception:
+                        # Best effort: a notice without readable authors is
+                        # still worth keeping.
+                        article['authors'] = 'not found'
+                    awaiting_authors = False
+
+            if isinstance(article['date'], datetime):
+                last_date = article['date']
+            elif last_date is not None:
+                article['date'] = last_date
+            else:
+                article['date'] = datetime.now()
+
+            article['object_id'] = str(text_nodes[-9])
+            text_nodes.pop()
+            article['text'] = re.sub('Tous droits réservés.*$', '',
+                                     ' '.join(text_nodes))
+
+            article['bdd'] = 'europresse'
+            article['url'] = ''
+
+            self.data.append(article)
+
+    def ajouter(self):
+        """Move the articles of self.data into self.articles.
+
+        An article is kept only when its object_id has not been stored yet
+        and its date is a ``datetime``.  self.data is emptied afterwards.
+        """
+        for article in self.data:
+            identifier = article['object_id']
+            if identifier in self.object_ids:
+                continue
+            if not isinstance(article['date'], datetime):
+                continue
+            self.object_ids.add(identifier)
+            self.articles.append(article)
+        self.data = []
+
+    def importer(self, file):
+        """Parse one file and store its articles; errors are only printed."""
+        print('file being parsed by europress parser: ', file)
+        try:
+            self.parse(file)
+            self.ajouter()
+        except Exception as e:
+            print("Error parsing", e)
+
+    def add(self, file=None):
+        """Parse every ``*html`` file of the directory *file*, then store.
+
+        Prints a usage message when *file* is None.  Unlike ``importer``,
+        parsing errors are not caught here.
+        """
+        import glob
+        if file is None:
+            print("Usage: self.add(\"your file\")")
+            return
+        # Sorted so that de-duplication keeps the same article on every run.
+        for path in sorted(glob.glob(file + "/*html")):
+            #self.test_unicode(path)
+            self.parse(path)
+        self.ajouter()
+
+
+def demo():
+    """Parse the directory given as first argument and print the dates."""
+    import sys
+    if len(sys.argv) < 2:
+        print("Usage: europresse.py <directory of html files>")
+        return
+    data = Europresse()
+    try:
+        data.add(sys.argv[1])
+    except Exception as e:
+        print(e)
+    #data.add('../data/html/html_english/')
+
+    for a in data.articles:
+        print(a['date'])
+    #print(len(data.articles))
+
+
+if __name__ == "__main__" :
+    try:
+        demo()
+    except Exception as error:
+        print(error)
--- a/sources/models.py
+++ b/sources/models.py
+from django.db import models
+
+# Create your models here.
--- a/sources/tests.py
+++ b/sources/tests.py
+from django.test import TestCase
+
+# Create your tests here.
--- a/sources/views.py
+++ b/sources/views.py
+from django.shortcuts import render
+
+# Create your views here.