Commit 56fca2a2 authored by Administrator

adding europresse import at saving point

parent fb33e646
......@@ -5,6 +5,9 @@ from nested_inlines.admin import NestedModelAdmin, NestedStackedInline, NestedTa
from documents.models import Source, Language, Project, Corpus, Document, Ngram, NgramDocument, List, ListNgram
from sources.europresse import Europresse
from gargantext_web.settings import MEDIA_ROOT
class DocumentInLine(admin.StackedInline):
model = Document
extra = 0
......@@ -61,8 +64,24 @@ class CorpusAdmin(admin.ModelAdmin):
if not change:
obj.user = request.user
obj.save()
for i in range(1,100000):
print("GOOOOOOOOOOOOOO")
print(obj.database, obj.language, obj.zip_file)
try:
# importer(
import zipfile
c = Europresse()
if zipfile.is_zipfile(obj.zip_file):
with zipfile.ZipFile(obj.zip_file, 'r') as z:
for f in z.namelist():
i = z.open(f, 'r')
for l in i.readline():
print(l)
#c.importer(MEDIA_ROOT + "/" + str(f))
# for article in c:
# print(article['title'])
except Exception as e:
print(e)
class DocumentAdmin(admin.ModelAdmin):
exclude = ('user',)
......
{
"metadata": {
"name": "",
"signature": "sha256:9a933c547abe9d85a6fbe1af7d7d7371d37042e6ad63ac70b7e46acedf7f294f"
"signature": "sha256:c112732dc666c365db7529d3971cad98ecc2bd84cdea3dc8613b609abf19e262"
},
"nbformat": 3,
"nbformat_minor": 0,
......@@ -103,6 +103,26 @@
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"zipfile.is_zipfile(\"/tmp/date.zip\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"False"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
......@@ -115,20 +135,24 @@
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"date.txt\n"
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: '/tmp/date.zip'",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-3-62bd2ffbe177>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mwith\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mZipFile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"/tmp/date.zip\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'r'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnamelist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/usr/lib/python3.4/zipfile.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, file, mode, compression, allowZip64)\u001b[0m\n\u001b[0;32m 921\u001b[0m \u001b[0mmodeDict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;34m'r'\u001b[0m \u001b[1;33m:\u001b[0m \u001b[1;34m'rb'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'w'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;34m'wb'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'a'\u001b[0m \u001b[1;33m:\u001b[0m \u001b[1;34m'r+b'\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 922\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 923\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmodeDict\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mmode\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 924\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 925\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mmode\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'a'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/tmp/date.zip'"
]
}
],
"prompt_number": 12
"prompt_number": 3
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Imporation\n",
"# Importation\n",
"## Europresse"
]
},
......
from django.contrib import admin
# Register your models here.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Europresse Database parser for HTML sources only.
This script uses three parsing methods:
1) REGEX (Regular Expressions) format detection
2) SAX (Simple Api for Xml) like method for events detection
3) DOM (Document Object Model), operating on the document as a whole for
tree detection.
Bug reports? Please contact the author:
__author__ : alexandre+gargantext @ delanoe.org
__licence__ : GPL version 3.0+
__DATE__ : 09 november 2013
__VERSION__ : 2.0
"""
import os
import sys
import imp
imp.reload(sys)
sys.path.append("../../gargantext/")
import re
import locale
from datetime import datetime, date
from lxml import etree
from documents.models import Document
#from .corpus import Corpus
class Europresse(Document):
    """Parser for Europresse HTML exports.

    1) First build a tree to parse the data,
    2) then nest each notice (article) in a dictionary,
    3) finally, hand the articles over as a list of dictionaries.

    NOTE(review): ``ajouter`` relies on ``self.object_ids`` and
    ``self.append``, neither of which is defined in this class; they are
    presumably provided by the base class -- confirm.
    """

    # Compiled once instead of on every notice/header iteration.
    _JOURNAL_RE = re.compile(r'(.*), (.*)', re.UNICODE)
    _DATE_FR_RE = re.compile(r'\d+\s*\w+\s+\d{4}', re.UNICODE)
    _DATE_EN_RE = re.compile(r'\w+\s+\d+,\s+\d{4}', re.UNICODE)
    _SECTION_RE = re.compile(r'(\D+),', re.UNICODE)
    _PAGE_RE = re.compile(r', p\. (\w+)', re.UNICODE)

    def __init__(self):
        """Initialise the parser state.

        ``self.data`` holds the articles (dictionaries) parsed from the
        HTML pages that have not yet been handed over by ``ajouter``.
        """
        # NOTE: the original author doubted this base initialisation is useful.
        super().__init__()

        # Specific declarations for Europresse
        self.data = []

        # Encoding
        self.codif = "UTF-8"
        self.localeEncoding = "fr_FR"

    def test_unicode(self, filename):
        """Convert *filename* from Latin-1 to UTF-8 in place when needed.

        The encoding is detected with the external ``file`` utility; the
        file is rewritten only when an ISO-8859 encoding is reported. If
        ``file`` is unavailable or fails, the file is left untouched.

        Raises:
            OSError: if the file cannot be read or replaced.
        """
        import subprocess
        import tempfile

        # Arguments are passed as a list so that quotes or shell
        # metacharacters in the file name cannot alter the command.
        try:
            detected = subprocess.check_output(
                ["file", "--brief", "--mime-encoding", filename])
        except (OSError, subprocess.CalledProcessError):
            return
        if b"iso-8859" not in detected.lower():
            return

        with open(filename, "rb") as source:
            converted = source.read().decode("latin-1").encode("utf-8")

        # Write next to the target so that the final rename stays on the
        # same filesystem.
        target_dir = os.path.dirname(os.path.abspath(filename))
        handle, temp_path = tempfile.mkstemp(dir=target_dir)
        try:
            with os.fdopen(handle, "wb") as temp_file:
                temp_file.write(converted)
            os.replace(temp_path, filename)
        finally:
            # Only still present when the replacement did not happen.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    @staticmethod
    def _blank_article():
        """Return a new article dictionary with every optional field empty."""
        return {'source': "", 'volume': "", 'date': "",
                'authors': "", 'section': "", 'page': "", 'text': "",
                'object_id': ""}

    @staticmethod
    def _parse_date(text, formats):
        """Return *text* parsed with the first matching format, else None."""
        for date_format in formats:
            try:
                return datetime.strptime(text, date_format)
            except ValueError:
                continue
        return None

    def _read_source(self, notice, article):
        """Fill 'source' (and 'volume' when present) from the publication name."""
        for name in notice.xpath("./tr/td/span[@class = 'DocPublicationName']"):
            if name.text is None:
                continue
            journal = self._JOURNAL_RE.match(name.text)
            if journal is not None:
                article['source'] = journal.group(1)
                article['volume'] = journal.group(2)
            else:
                # NOTE(review): this branch stores bytes whereas the branch
                # above stores str -- confirm which one consumers expect.
                article['source'] = name.text.encode(self.codif)

    def _read_header(self, text, article):
        """Fill 'date', 'section' and 'page' from one DocHeader span text."""
        if text is None:
            # A span whose first child is an element has no leading text.
            return
        if isinstance(text, bytes):
            text = text.decode()

        if self._DATE_FR_RE.match(text) is not None:
            # strptime's %B depends on the process-wide locale.
            self.localeEncoding = "fr_FR"
            locale.setlocale(locale.LC_ALL, self.localeEncoding)
            parsed = self._parse_date(text, ('%d %B %Y', '%B %Y'))
            if parsed is not None:
                article['date'] = parsed

        if self._DATE_EN_RE.match(text) is not None:
            self.localeEncoding = "en_GB.UTF-8"
            locale.setlocale(locale.LC_ALL, self.localeEncoding)
            parsed = self._parse_date(text, ('%B %d, %Y', '%B %Y'))
            if parsed is not None:
                article['date'] = parsed

        section = self._SECTION_RE.match(text)
        if section is not None:
            article['section'] = section.group(1).encode(self.codif)

        # search(), not match(): the pattern begins with ", p." and so can
        # only be found after the text that precedes the page marker.
        page = self._PAGE_RE.search(text)
        if page is not None:
            article['page'] = page.group(1).encode(self.codif)

    def _read_authors(self, notice):
        """Return the text found at the second <br> after the title, or None."""
        authors = None
        after_title = False
        breaks_left = 2
        for node in notice.iter():
            if node.tag == "span" and node.attrib.get('class') == 'TitreArticleVisu':
                after_title = True
                breaks_left = 2
            if after_title and node.tag == "br":
                breaks_left -= 1
            if after_title and breaks_left == 0:
                try:
                    # Serialise to str first: str.title() rejects bytes.
                    raw = etree.tostring(node, method="text", encoding="unicode")
                    authors = raw.title().encode(self.codif)
                except (TypeError, ValueError, etree.LxmlError):
                    authors = 'not found'
                after_title = False
                breaks_left = 2
        return authors

    def parse(self, filename):
        """Parse *filename* and queue its articles in ``self.data``.

        Every non-empty ``/html/body/table`` element is read as one notice.
        A notice without a parsable date inherits the date of the previous
        notice, or the current time when there is none. Notices with fewer
        than nine text fragments are skipped because the identifier is read
        from the ninth fragment from the end.
        """
        parser = etree.HTMLParser(encoding=self.codif)
        tree = etree.parse(filename, parser)
        last_date = None

        for notice in tree.xpath('/html/body/table'):
            if not len(notice):
                continue

            article = self._blank_article()
            self._read_source(notice, article)
            for header in notice.xpath("./tr/td/span[@class = 'DocHeader']"):
                self._read_header(header.text, article)

            title = notice.xpath("string(./tr/td/span[@class = 'TitreArticleVisu'])")
            article['title'] = title.encode(self.codif)

            authors = self._read_authors(notice)
            if authors is not None:
                article['authors'] = authors

            fragments = notice.xpath("./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()")
            if len(fragments) < 9:
                print("Skipping notice without identifier in", filename)
                continue

            if isinstance(article['date'], datetime):
                last_date = article['date']
            elif last_date is not None:
                article['date'] = last_date
            else:
                article['date'] = datetime.now()

            article['object_id'] = fragments[-9]
            fragments.pop()
            article['text'] = re.sub('Tous droits réservés.*$', '', ' '.join(fragments))
            article['bdd'] = 'europresse'
            article['url'] = ''

            self.data.append(article)

    def ajouter(self):
        """Hand over the queued notices, dropping duplicates, then clear the queue.

        A notice is passed to ``self.append`` only when its 'object_id' is
        not yet in ``self.object_ids`` and its 'date' is a ``datetime``.
        """
        for notice in self.data:
            if notice['object_id'] not in self.object_ids and isinstance(notice['date'], datetime):
                self.object_ids.append(notice['object_id'])
                self.append(notice)
        self.data = []

    def importer(self, file):
        """Parse *file* and hand over its notices; errors are printed, not raised."""
        print('file being parsed by europress parser: ', file)
        try:
            self.parse(file)
            self.ajouter()
        except Exception as e:
            print("Error parsing", e)

    def add(self, file=None):
        """Parse every ``*html`` file found in the directory *file*.

        Prints a usage message when *file* is None.
        """
        import glob

        if file is None:
            print("Usage: self.add(\"your file\")")
            return

        for path in glob.glob(file + "/*html"):
            #self.test_unicode(path)
            self.parse(path)
            self.ajouter()
def demo():
    """Parse the directory given as first command-line argument and print dates.

    Errors raised while parsing are printed rather than propagated.
    """
    import sys

    # Without this check a missing argument only surfaces as the cryptic
    # "list index out of range".
    if len(sys.argv) < 2:
        print("Usage: %s <directory of Europresse HTML files>" % sys.argv[0])
        return

    data = Europresse()
    try:
        data.add(sys.argv[1])
    except Exception as e:
        print(e)
    #data.add('../data/html/html_english/')

    # NOTE(review): ``corpus`` is not set by Europresse itself; presumably
    # it comes from the base class -- confirm.
    for a in data.corpus:
        print(a['date'])
    #print(len(data.corpus))


if __name__ == "__main__":
    try:
        demo()
    except Exception as error:
        print(error)
from django.db import models
# Create your models here.
from django.test import TestCase
# Create your tests here.
from django.shortcuts import render
# Create your views here.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment