Commit b67643b0 authored by Administrator's avatar Administrator

importateur europresse (with previous ids) ok, isi:ok, bud date for ris

parent 9968bfff
{
"metadata": {
"name": "",
"signature": "sha256:2afae28d08bbb0945aaca44a5b704550048c5dc193cc3d81cb11a551fcc03864"
"signature": "sha256:9a933c547abe9d85a6fbe1af7d7d7371d37042e6ad63ac70b7e46acedf7f294f"
},
"nbformat": 3,
"nbformat_minor": 0,
......@@ -92,6 +92,38 @@
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import zipfile"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with zipfile.ZipFile(\"/tmp/date.zip\", 'r') as f:\n",
" for x in f.namelist():\n",
" print(x)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"date.txt\n"
]
}
],
"prompt_number": 12
},
{
"cell_type": "markdown",
"metadata": {},
......@@ -696,32 +728,150 @@
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# REDIS"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"input": [
"BROKER_URL = 'redis://localhost:6379/0'\n",
"# redis://:password@hostname:port/db_number"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"input": [
"BROKER_TRANSPORT_OPTIONS = {'visibility_timeout': 3600} # 1 hour."
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"input": [
"CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 91
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"BROKER_TRANSPORT_OPTIONS = {'fanout_prefix': True}"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"BROKER_TRANSPORT_OPTIONS = {'visibility_timeout': 43200}"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import absolute_import\n",
"\n",
"from celery import Celery\n",
"\n",
"app = Celery('proj',\n",
" broker='redis://localhost:6379/0',\n",
" backend='redis://localhost:6379/0',\n",
" include=['proj.tasks'])\n",
"\n",
"# Optional configuration, see the application user guide.\n",
"app.conf.update(\n",
" CELERY_TASK_RESULT_EXPIRES=3600,\n",
")\n",
"\n",
"if __name__ == '__main__':\n",
" pass#app.start()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import absolute_import\n",
"\n",
"\n",
"@app.task\n",
"def add(x, y):\n",
" return x + y\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"app.send_task(add(3, 1000))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 15,
"text": [
"<AsyncResult: c4807752-eb28-4e0f-b8b9-fce8267bddd3>"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"celery -A proj worker --loglevel=info"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (<ipython-input-16-5806eb0c4fe2>, line 1)",
"output_type": "pyerr",
"traceback": [
"\u001b[1;36m File \u001b[1;32m\"<ipython-input-16-5806eb0c4fe2>\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m celery -A proj worker --loglevel=info\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
......
{
"metadata": {
"name": "",
"signature": "sha256:c112732dc666c365db7529d3971cad98ecc2bd84cdea3dc8613b609abf19e262"
"signature": "sha256:fd8cdda63e0e9cba7dbdfac864550d69bace0f3f834ebd216402a50bc10992b1"
},
"nbformat": 3,
"nbformat_minor": 0,
......@@ -148,6 +148,61 @@
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from lxml import etree"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"help(etree.parse)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Help on built-in function parse in module lxml.etree:\n",
"\n",
"parse(...)\n",
" parse(source, parser=None, base_url=None)\n",
" \n",
" Return an ElementTree object loaded with source elements. If no parser\n",
" is provided as second argument, the default parser is used.\n",
" \n",
" The ``source`` can be any of the following:\n",
" \n",
" - a file name/path\n",
" - a file object\n",
" - a file-like object\n",
" - a URL using the HTTP or FTP protocol\n",
" \n",
" To parse from a string, use the ``fromstring()`` function instead.\n",
" \n",
" Note that it is generally faster to parse from a file path or URL\n",
" than from an open file object or file-like object. Transparent\n",
" decompression from gzip compressed sources is supported (unless\n",
" explicitly disabled in libxml2).\n",
" \n",
" The ``base_url`` keyword allows setting a URL for the document\n",
" when parsing from a file-like object. This is needed when looking\n",
" up external entities (DTD, XInclude, ...) with relative paths.\n",
"\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "markdown",
"metadata": {},
......
......@@ -31,7 +31,7 @@ from lxml import etree
from documents.models import Document
#from .corpus import Corpus
class Europresse(Document):
class Europresse():
"""
1) First build tree to parse data
2) Then each notice (article) is nested in a dictionary,
......@@ -45,7 +45,6 @@ class Europresse(Document):
# Specific declarations for Europresse
self.data = []
self.object_ids = []
# Encoding
self.codif = "UTF-8"
......@@ -177,11 +176,16 @@ class Europresse(Document):
'authors': "", 'section': "", 'page':"", 'text': "", 'object_id':""}
count += 1
def add(self, project=None, corpus=None, user=None):
def add(self, project=None, corpus=None, user=None, ids=None):
""" Appends notices to self.corpus from self.data removing duplicates"""
if ids is not None:
self.object_ids = ids
else:
self.object_ids = set()
for i in self.data:
if i['uniqu_id'] not in self.object_ids and isinstance(i['date'], datetime):
self.object_ids.append(i['uniqu_id'])
self.object_ids.add(i['uniqu_id'])
doc = Document()
doc.project = project
......@@ -190,7 +194,6 @@ class Europresse(Document):
doc.date = i['date']
doc.uniqu_id= i['uniqu_id']
doc.title = i['title']
print(doc.project)
doc.source = i['source']
doc.authors = i['authors']
......@@ -210,7 +213,6 @@ def demo():
except Exception as e:
print("very usefull function", e)
for a in data.corpus:
print(a['date'])
......
# import Celery here
from documents.models import Document
from sources.europresse import Europresse
from sources.isi import Isi
from sources.pubmed import Pubmed
......@@ -9,58 +10,65 @@ import zipfile
# importer: reads every member of the zip archive `zip_file` with the parser
# selected by `source.database`, then hands the parsed notices to that
# parser's add()/ajouter() method.
#
# Parameters:
#   source   -- object whose `.database` string selects the parser branch
#   language -- accepted but not referenced anywhere in the visible body
#   zip_file -- passed to zipfile.is_zipfile() and zipfile.ZipFile()
#   project, corpus, user -- forwarded unchanged to add()/ajouter()
#
# No return statement appears in the visible body. Each parser branch wraps
# its work in try/except Exception and only prints the exception.
#
# NOTE(review): this listing looks like a rendered diff in which pre- and
# post-commit lines are interleaved and indentation is stripped (see the
# pairs of near-identical consecutive lines below) -- confirm against the
# real file before relying on the exact statement order.
def importer(source, language, zip_file, project=None, corpus=None, user=None):
# uniqu_id values of the Documents already filtered by `corpus`; passed to the
# parsers as `ids`.
ids = set([ doc.uniqu_id for doc in Document.objects.filter(corpus=corpus)])
if source.database == "Europresse":
try:
print("Europresse DB detected")
c = Europresse()
if zipfile.is_zipfile(zip_file):
with zipfile.ZipFile(zip_file, 'r') as z:
for fichiers in z.namelist():
# NOTE(review): the handle returned by z.open() is never closed explicitly
# in any of the branches.
fichier = z.open(fichiers, 'r')
c.parse(fichier)
c.add(project=project, corpus=corpus, user=user)
c.add(project=project, corpus=corpus, user=user, ids=ids)
except Exception as e:
print(e)
elif source.database == "Isi":
elif source.database == "Web of Science (ISI format)":
try:
print("ISI DB detected")
c = Isi()
if zipfile.is_zipfile(zip_file):
with zipfile.ZipFile(zip_file, 'r') as z:
for fichiers in z.namelist():
print("parsing %s" % (fichiers))
fichier = z.open(fichiers, 'r')
c.parse(fichier, bdd='isi')
c.add(project=project, corpus=corpus, user=user)
c.add(project=project, corpus=corpus, user=user, ids=ids)
except Exception as e:
print(e)
elif source.database == "Ris":
elif source.database == "RIS (Zotero)":
try:
print("RIS DB detected")
# The RIS branch uses the Isi parser, called with bdd='ris' below.
c = Isi()
if zipfile.is_zipfile(zip_file):
with zipfile.ZipFile(zip_file, 'r') as z:
for fichiers in z.namelist():
fichier = z.open(fichiers, 'r')
c.parse(fichier)
c.ajouter(project=project, corpus=corpus, user=user)
c.parse(fichier, bdd='ris')
c.add(project=project, corpus=corpus, user=user, ids=ids)
except Exception as e:
print(e)
elif source.database == "Pubmed":
try:
print("PubMed DB detected")
c = Pubmed()
if zipfile.is_zipfile(zip_file):
with zipfile.ZipFile(zip_file, 'r') as z:
for fichiers in z.namelist():
fichier = z.open(fichiers, 'r')
c.parse(fichier)
# NOTE(review): this branch calls ajouter() while the other branches call
# add() -- confirm that Pubmed exposes ajouter() and that it accepts `ids`;
# otherwise the except below will only print the resulting error.
c.ajouter(project=project, corpus=corpus, user=user)
c.ajouter(project=project, corpus=corpus, user=user, ids=ids)
except Exception as e:
print(e)
else:
pass
print("Corpus not detected")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
ISI parser.
__author__ : alexandre+gargantext @ delanoe.org
__licence__ : GPL version 3.0+
__DATE__ : 2014
__VERSION__ : 1.0
"""
import os, sys
#reload(sys)
import re
import locale
# import hashlib ?
from datetime import datetime, date
from dateutil import parser
......@@ -22,7 +36,6 @@ class Isi() :
"""
# Specific declarations for Isi
self.data = []
self.object_ids = []
def read_param(self,file) :
"""
......@@ -35,7 +48,7 @@ class Isi() :
for line in lines:
if line[0] != '#':
tag = line.split('\t')
tags[tag[1]] = [tag[0], tag[2]]
tags[str(tag[1])] = [str(tag[0]), str(tag[2])]
return tags
def rules(self, parameters) :
......@@ -51,12 +64,17 @@ class Isi() :
"""
#source = open(file, 'r')
lines = source.readlines()
document = {}
doc = {}
if bdd == 'isi':
parameters = self.read_param('sources/parameters/isi.init')
try:
print("reading parameters ISI")
parameters = self.read_param('sources/parameters/isi.init')
except Exception as e: print(e)
elif bdd == 'ris':
parameters = self.read_param('sources/parameters/ris.init')
try:
print("reading parameters RIS")
parameters = self.read_param('sources/parameters/ris.init')
except Exception as e: print(e)
for key in list(parameters.keys()):
if parameters[key][0] == 'BEGIN' :
......@@ -68,8 +86,14 @@ class Isi() :
del parameters[end]
for line in lines :
if document == {} and line[:2] == begin :
document['url'] = " "
line = str(line, encoding='UTF-8')
if bdd == 'ris':
line = line.replace(' - ', '')
if doc == {} and line[:2] == begin :
#print(line)
doc['url'] = " "
key = ""
result = ""
......@@ -77,9 +101,9 @@ class Isi() :
if key != "" and key != line[:2]:
try:
document[parameters[key][0]] = result
doc[parameters[key][0]] = result
except Exception as e: print(e)
#document.setdefault(parameters[key][0],[]).append(result)
#doc.setdefault(parameters[key][0],[]).append(result)
key = line[:2]
result = line[2:].strip()
......@@ -89,49 +113,85 @@ class Isi() :
result = result + ' ' + line[2:].strip()#.split(";")
except Exception as error :
pass
print(error)
elif line[:2] == end :
document[parameters[key][0]] = result
doc[parameters[key][0]] = result
try:
try:
date = document['year'] + " " + document['month']
document['date'] = parser.parse(date)
date = doc['year'] + " " + doc['month']
doc['date'] = parser.parse(date)
except:
date = document['year']
document['date'] = datetime.strptime(date, '%Y')
date = doc['year']
doc['date'] = datetime.strptime(date, '%Y')
except Exception as e: print('88', e)
self.data.append(document)
document = {}
except Exception as e:
print('88', e)
try:
print(doc['year'])
except Exception as e: print('58',e)
self.data.append(doc)
doc = {}
# add: builds a Document from each notice held in self.data, saves it,
# attaches it to `corpus`, and resets self.data to an empty list.
#
# Parameters:
#   project, user -- assigned to each new Document
#   corpus        -- passed to doc.corpus.add() for each new Document
#   ids           -- optional collection of already-known uniqu_id values;
#                    when it is None an empty set is used instead
#
# A notice is stored only when its 'uniqu_id' is not yet in self.object_ids
# and its 'date' is a datetime instance.
#
# NOTE(review): this listing looks like a rendered diff in which pre- and
# post-commit lines are interleaved and indentation is stripped (two `def`
# lines, plain assignments followed by try/except copies of the same
# assignments) -- confirm against the real file.
def add(self, project=None, corpus=None, user=None):
def add(self, project=None, corpus=None, user=None, ids=None):
""" Appends notices to self.corpus from self.data removing duplicates"""
if ids is not None:
# NOTE(review): no copy is made, so self.object_ids refers to the caller's
# `ids` object and the .add() call further down also changes it.
self.object_ids = ids
else:
self.object_ids = set()
for i in self.data:
if 'uniqu_id' not in i.keys():
#crypt = md5.new()
#crypt.update(i['title'])
#i['uniqu_id'] = crypt.digest()
# Fallback identifier built from title and date when the notice has none.
# NOTE(review): the isinstance() test just below expects i['date'] to be a
# datetime; if i['title'] is a str this concatenation would raise
# TypeError -- confirm and consider str(i['date']).
i['uniqu_id'] = i['title'] + i['date']
if i['uniqu_id'] not in self.object_ids and isinstance(i['date'], datetime):
self.object_ids.append(i['uniqu_id'])
self.object_ids.add(i['uniqu_id'])
doc = Document()
doc.project = project
doc.user = user
try:
doc.project = project
except Exception as e: print(e)
try:
doc.user = user
except Exception as e: print(e)
doc.date = i['date']
doc.uniqu_id= i['uniqu_id']
doc.title = i['title']
print(doc.project)
try:
doc.date = i['date']
except Exception as e: print(e)
try:
doc.uniqu_id= i['uniqu_id']
except Exception as e: print(e)
try:
doc.title = i['title']
except Exception as e: print(e)
doc.source = i['source']
doc.authors = i['authors']
doc.text = i['text']
try:
doc.source = i['source']
except Exception as e: print(e)
try:
doc.authors = i['authors']
except Exception as e: print(e)
try:
doc.abstract = i['abstract']
except Exception as e: print(e)
doc.save()
try:
doc.save()
except Exception as e: print(e)
# NOTE(review): unlike the statements above, this call has no try/except;
# presumably it still runs after a doc.save() whose exception was only
# printed -- confirm.
doc.corpus.add(corpus)
self.data = []
def demo():
import sys
data = Isi()
......
......@@ -15,7 +15,7 @@ language LA ""
DT DT ""
keywords DE ;
ID ID ;
text AB
abstract AB
ISIC1 C1 \n
reprint_author RP ,
email EM \n
......@@ -41,5 +41,5 @@ page PG ""
field WC ""
SC SC ""
GA GA ""
object_id UT ""
uniqu_id UT ""
END ER ""
##############################################################################
# LEGEND:
# NAME (what you want[1]) FIELD (see your data) SEPARATORS (see your data)
#
# [1]
# Be careful with these variable names, which must not be changed:
# BEGIN, ID-unique, END
##############################################################################
BEGIN TY ""
authors AU \n
AF AF "\n"
title TI ""
source SO "\n"
language LA ""
DT DT ""
keywords KW ;
ID ID ;
abstract AB
text ST ,
ISIC1 C1 \n
reprint_author RP ,
email EM \n
thanks FX
CR CR \n
number NR \n
TC TC ""
Z9 Z9 ""
PU PU ""
PI PI ""
PA PA ""
SN SN ""
journal_small J9 ""
JI JI ""
month PD ""
year PY ""
volume VL ""
IS IS ""
BP BP ""
EP EP ""
DOI DI ""
page PG ""
field WC ""
SC SC ""
GA GA ""
uniqu_id DO ""
END ER ""
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment