Commit 56fca2a2 authored by Administrator's avatar Administrator

adding europresse import at saving point

parent fb33e646
...@@ -5,6 +5,9 @@ from nested_inlines.admin import NestedModelAdmin, NestedStackedInline, NestedTa ...@@ -5,6 +5,9 @@ from nested_inlines.admin import NestedModelAdmin, NestedStackedInline, NestedTa
from documents.models import Source, Language, Project, Corpus, Document, Ngram, NgramDocument, List, ListNgram from documents.models import Source, Language, Project, Corpus, Document, Ngram, NgramDocument, List, ListNgram
from sources.europresse import Europresse
from gargantext_web.settings import MEDIA_ROOT
class DocumentInLine(admin.StackedInline): class DocumentInLine(admin.StackedInline):
model = Document model = Document
extra = 0 extra = 0
...@@ -61,8 +64,24 @@ class CorpusAdmin(admin.ModelAdmin): ...@@ -61,8 +64,24 @@ class CorpusAdmin(admin.ModelAdmin):
if not change: if not change:
obj.user = request.user obj.user = request.user
obj.save() obj.save()
for i in range(1,100000): print(obj.database, obj.language, obj.zip_file)
print("GOOOOOOOOOOOOOO")
try:
# importer(
import zipfile
c = Europresse()
if zipfile.is_zipfile(obj.zip_file):
with zipfile.ZipFile(obj.zip_file, 'r') as z:
for f in z.namelist():
i = z.open(f, 'r')
for l in i.readline():
print(l)
#c.importer(MEDIA_ROOT + "/" + str(f))
# for article in c:
# print(article['title'])
except Exception as e:
print(e)
class DocumentAdmin(admin.ModelAdmin): class DocumentAdmin(admin.ModelAdmin):
exclude = ('user',) exclude = ('user',)
......
{ {
"metadata": { "metadata": {
"name": "", "name": "",
"signature": "sha256:9a933c547abe9d85a6fbe1af7d7d7371d37042e6ad63ac70b7e46acedf7f294f" "signature": "sha256:c112732dc666c365db7529d3971cad98ecc2bd84cdea3dc8613b609abf19e262"
}, },
"nbformat": 3, "nbformat": 3,
"nbformat_minor": 0, "nbformat_minor": 0,
...@@ -103,6 +103,26 @@ ...@@ -103,6 +103,26 @@
"outputs": [], "outputs": [],
"prompt_number": 1 "prompt_number": 1
}, },
{
"cell_type": "code",
"collapsed": false,
"input": [
"zipfile.is_zipfile(\"/tmp/date.zip\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"False"
]
}
],
"prompt_number": 2
},
{ {
"cell_type": "code", "cell_type": "code",
"collapsed": false, "collapsed": false,
...@@ -115,20 +135,24 @@ ...@@ -115,20 +135,24 @@
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"output_type": "stream", "ename": "FileNotFoundError",
"stream": "stdout", "evalue": "[Errno 2] No such file or directory: '/tmp/date.zip'",
"text": [ "output_type": "pyerr",
"date.txt\n" "traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-3-62bd2ffbe177>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mwith\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mZipFile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"/tmp/date.zip\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'r'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnamelist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/usr/lib/python3.4/zipfile.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, file, mode, compression, allowZip64)\u001b[0m\n\u001b[0;32m 921\u001b[0m \u001b[0mmodeDict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;34m'r'\u001b[0m \u001b[1;33m:\u001b[0m \u001b[1;34m'rb'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'w'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;34m'wb'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'a'\u001b[0m \u001b[1;33m:\u001b[0m \u001b[1;34m'r+b'\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 922\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 923\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodeDict\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mmode\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 924\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 925\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mmode\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'a'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/tmp/date.zip'"
] ]
} }
], ],
"prompt_number": 12 "prompt_number": 3
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Imporation\n", "# Importation\n",
"## Europresse" "## Europresse"
] ]
}, },
......
from django.contrib import admin
# Register your models here.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Europresse Database parser for HTML sources only.
This script is using 3 methods of parsing:
1) REGEX (Regular Expressions) format detection
2) SAX (Simple Api for Xml) like method for events detection
3) DOM (Document Object Model), operating on the document as a whole for
tree detection.
Bug reports? Please contact the author:
__author__ : alexandre+gargantext @ delanoe.org
__licence__ : GPL version 3.0+
__DATE__ : 09 november 2013
__VERSION__ : 2.0
"""
import os
import sys
import imp
imp.reload(sys)
sys.path.append("../../gargantext/")
import re
import locale
from datetime import datetime, date
from lxml import etree
from documents.models import Document
#from .corpus import Corpus
class Europresse(Document):
    """Parser for Europresse HTML exports.

    1) First build a tree to parse the data.
    2) Then each notice (article) is stored in a dictionary.
    3) Finally, the parsed articles are collected as a list of dictionaries
       in ``self.data``.
    """

    def __init__(self):
        """Initialise the parser state.

        ``self.data`` holds the article dictionaries produced by ``parse``
        until ``ajouter`` consumes them.
        """
        # The original author doubted that this base initialisation is useful.
        Document.__init__(self)

        # Specific declarations for Europresse
        self.data = []

        # Encoding
        self.codif = "UTF-8"
        self.localeEncoding = "fr_FR"

    def test_unicode(self, filename):
        """Convert ``filename`` from Latin-1 to UTF-8 in place when needed.

        The encoding is detected with the external ``file`` utility and the
        file is rewritten only when it is reported as ISO-8859. This is a
        best-effort helper: when detection or conversion fails the error is
        printed and the file is left untouched.
        """
        import os
        import subprocess
        import tempfile

        # Arguments are passed as a list (no shell), so a filename containing
        # quotes or shell metacharacters cannot inject commands.
        try:
            detected = subprocess.check_output(
                ["file", "--brief", "--mime-encoding", "--", filename],
                universal_newlines=True,
            )
        except (OSError, subprocess.CalledProcessError) as error:
            print(error)
            return

        if "iso-8859" not in detected.lower():
            return

        try:
            with open(filename, "rb") as source:
                converted = source.read().decode("latin-1").encode("utf-8")
            # The temporary file lives next to the target so that the final
            # os.replace() stays on one filesystem.
            directory = os.path.dirname(os.path.abspath(filename))
            descriptor, temporary = tempfile.mkstemp(dir=directory)
            try:
                with os.fdopen(descriptor, "wb") as target:
                    target.write(converted)
                os.replace(temporary, filename)
            except OSError:
                os.unlink(temporary)
                raise
        except OSError as error:
            print(error)

    @staticmethod
    def _parse_date(text, formats):
        """Return ``text`` parsed with the first matching format, else None."""
        for date_format in formats:
            try:
                return datetime.strptime(text, date_format)
            except ValueError:
                continue
        return None

    def parse(self, filename):
        """Parse ``filename`` and append one dictionary per notice to self.data.

        Each ``/html/body/table`` element that has children is treated as one
        notice. The keys filled in are 'source', 'volume', 'date', 'section',
        'page', 'title', 'text', 'authors', 'object_id', 'bdd' and 'url'
        (the first five only when the matching markup is found).

        NOTE(review): 'title', 'section', 'page' and 'authors' are stored as
        bytes encoded with self.codif while 'text' and (usually) 'source'
        are str -- confirm what the consumers expect.

        Raises whatever lxml raises for an unreadable file, locale.Error
        when the required locale is not installed, and IndexError when a
        notice has fewer than nine text nodes.
        """
        # Patterns are compiled once per call instead of once per element.
        journal_re = re.compile(r'(.*), (.*)', re.UNICODE)
        date_fr_re = re.compile(r'\d+\s*\w+\s+\d{4}', re.UNICODE)
        date_en_re = re.compile(r'\w+\s+\d+,\s+\d{4}', re.UNICODE)
        section_re = re.compile(r'(\D+),', re.UNICODE)
        page_re = re.compile(r', p. (\w+)', re.UNICODE)

        article = {}
        last_date = None  # date of the most recent notice that had one

        parser = etree.HTMLParser(encoding=self.codif)
        tree = etree.parse(filename, parser)

        for notice in tree.xpath('/html/body/table'):
            if not len(notice):
                continue

            for name in notice.xpath("./tr/td/span[@class = 'DocPublicationName']"):
                if name.text is None:
                    continue
                journal = journal_re.match(name.text)
                if journal is not None:
                    article['source'] = journal.group(1)
                    article['volume'] = journal.group(2)
                else:
                    article['source'] = name.text.encode(self.codif)

            for header in notice.xpath("./tr/td/span[@class = 'DocHeader']"):
                text = header.text
                if text is None:
                    # An empty header span has nothing to match against.
                    continue
                if isinstance(text, bytes):
                    text = text.decode()

                # locale.setlocale() changes the locale of the whole process;
                # it is needed so that '%B' recognises localised month names.
                if date_fr_re.match(text) is not None:
                    self.localeEncoding = "fr_FR"
                    locale.setlocale(locale.LC_ALL, self.localeEncoding)
                    parsed = self._parse_date(text, ('%d %B %Y', '%B %Y'))
                    if parsed is not None:
                        article['date'] = parsed

                if date_en_re.match(text) is not None:
                    self.localeEncoding = "en_GB.UTF-8"
                    locale.setlocale(locale.LC_ALL, self.localeEncoding)
                    parsed = self._parse_date(text, ('%B %d, %Y', '%B %Y'))
                    if parsed is not None:
                        article['date'] = parsed

                section = section_re.match(text)
                if section is not None:
                    article['section'] = section.group(1).encode(self.codif)

                # search(), not match(): the page marker ", p. N" follows
                # other header text, so an anchored match can never succeed.
                page = page_re.search(text)
                if page is not None:
                    article['page'] = page.group(1).encode(self.codif)

            article['title'] = notice.xpath(
                "string(./tr/td/span[@class = 'TitreArticleVisu'])"
            ).encode(self.codif)
            article['text'] = notice.xpath(
                "./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
            )

            # The authors are taken from the second <br> element that follows
            # the title span (its text includes the tail after the tag).
            after_title = False
            breaks_left = 10
            for node in notice.iter():
                if node.tag == "span" and node.attrib.get('class') == 'TitreArticleVisu':
                    after_title = True
                    breaks_left = 2
                if after_title and node.tag == "br":
                    breaks_left -= 1
                if after_title and breaks_left == 0:
                    try:
                        # tostring() returns bytes for a named encoding, so
                        # decode before title-casing, then re-encode.
                        raw_authors = etree.tostring(node, method="text", encoding=self.codif)
                        article['authors'] = raw_authors.decode(self.codif).title().encode(self.codif)
                    except Exception:
                        article['authors'] = 'not found'
                    after_title = False
                    breaks_left = 10

            # A notice without a parsed date inherits the date of the previous
            # dated notice; with no previous date it falls back to "now".
            if isinstance(article.get('date'), datetime):
                last_date = article['date']
            elif last_date is not None:
                article['date'] = last_date
            else:
                article['date'] = datetime.now()

            # NOTE(review): the ninth text node from the end is used as the
            # notice identifier -- presumably a fixed footer layout; confirm.
            article['object_id'] = article['text'][-9]
            article['text'].pop()
            article['text'] = ' '.join(article['text'])
            article['text'] = re.sub('Tous droits réservés.*$', '', article['text'])
            article['bdd'] = 'europresse'
            article['url'] = ''

            self.data.append(article)

            # Start the next notice from a blank record.
            article = {'source': "", 'volume': "", 'date': "",
                       'authors': "", 'section': "", 'page': "", 'text': "", 'object_id': ""}

    def ajouter(self):
        """Append the notices held in self.data to self, skipping duplicates.

        A notice is kept only when its 'object_id' has not been seen before
        and its 'date' is a datetime. self.data is emptied afterwards.

        NOTE(review): self.object_ids and self.append are not defined in this
        class; presumably they come from the base class -- confirm.
        """
        for notice in self.data:
            if notice['object_id'] not in self.object_ids and isinstance(notice['date'], datetime):
                self.object_ids.append(notice['object_id'])
                self.append(notice)
        self.data = []

    def importer(self, file):
        """Parse a single file and store its notices; errors are printed."""
        print('file being parsed by europress parser: ', file)
        try:
            self.parse(file)
            self.ajouter()
        except Exception as e:
            print("Error parsing", e)

    def add(self, file=None):
        """Parse every path matching ``file + "/*html"`` and store the notices.

        ``file`` is used as a directory prefix for the glob pattern. When it
        is None a usage message is printed instead.
        """
        import glob

        if file is None:
            print("Usage: self.add(\"your file\")")
            return

        # A separate loop variable keeps the ``file`` argument intact.
        for path in glob.glob(file + "/*html"):
            #self.test_unicode(path)
            self.parse(path)
            self.ajouter()
def demo():
    """Parse the directory named by the first command-line argument.

    Any error raised while adding the files is printed; the date of every
    entry in the parser's ``corpus`` is then printed.

    NOTE(review): ``corpus`` is not assigned in the visible Europresse
    class -- confirm where it comes from.
    """
    import sys

    europresse = Europresse()
    try:
        europresse.add(sys.argv[1])
    except Exception as failure:
        print(failure)

    for entry in europresse.corpus:
        print(entry['date'])
if __name__ == "__main__":
    # Script entry point: run the demo and print any failure instead of
    # letting the traceback escape.
    try:
        demo()
    except Exception as failure:
        print(failure)
from django.db import models
# Create your models here.
from django.test import TestCase
# Create your tests here.
from django.shortcuts import render
# Create your views here.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment