humanities / gargantext / Commits / b67643b0

Commit b67643b0 authored Oct 02, 2014 by Administrator
Europresse importer (with previous ids) ok, ISI: ok, bad date for RIS
parent 9968bfff
Showing 8 changed files with 378 additions and 57 deletions (+378 -57)
.ipynb_checkpoints/gargantext_web_tutorial-checkpoint.ipynb  +157 -7
dependances.deb  +0 -0
gargantext_web_tutorial.ipynb  +56 -1
sources/europresse.py  +8 -6
sources/importateur.py  +16 -8
sources/isi.py  +93 -33
sources/parameters/isi.init  +2 -2
sources/parameters/ris.init  +46 -0
.ipynb_checkpoints/gargantext_web_tutorial-checkpoint.ipynb
{
 "metadata": {
  "name": "",
-  "signature": "sha256:2afae28d08bbb0945aaca44a5b704550048c5dc193cc3d81cb11a551fcc03864"
+  "signature": "sha256:9a933c547abe9d85a6fbe1af7d7d7371d37042e6ad63ac70b7e46acedf7f294f"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
...
...
@@ -92,6 +92,38 @@
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import zipfile"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"with zipfile.ZipFile(\"/tmp/date.zip\", 'r') as f:\n",
" for x in f.namelist():\n",
" print(x)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"date.txt\n"
]
}
],
"prompt_number": 12
},
{
"cell_type": "markdown",
"metadata": {},
...
...
@@ -696,32 +728,150 @@
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#REDIS"
]
},
{
 "cell_type": "code",
 "collapsed": false,
- "input": [],
+ "input": [
+  "BROKER_URL = 'redis://localhost:6379/0'\n",
+  "# redis://:password@hostname:port/db_number"
+ ],
 "language": "python",
 "metadata": {},
 "outputs": [],
- "prompt_number": 12
+ "prompt_number": 1
},
{
 "cell_type": "code",
 "collapsed": false,
- "input": [],
+ "input": [
+  "BROKER_TRANSPORT_OPTIONS = {'visibility_timeout': 3600} # 1 hour."
+ ],
 "language": "python",
 "metadata": {},
 "outputs": [],
- "prompt_number": 12
+ "prompt_number": 2
},
{
 "cell_type": "code",
 "collapsed": false,
- "input": [],
+ "input": [
+  "CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'"
+ ],
 "language": "python",
 "metadata": {},
 "outputs": [],
- "prompt_number": 91
+ "prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"BROKER_TRANSPORT_OPTIONS = {'fanout_prefix': True}"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"BROKER_TRANSPORT_OPTIONS = {'visibility_timeout': 43200}"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import absolute_import\n",
"\n",
"from celery import Celery\n",
"\n",
"app = Celery('proj',\n",
" broker='redis://localhost:6379/0',\n",
" backend='redis://localhost:6379/0',\n",
" include=['proj.tasks'])\n",
"\n",
"# Optional configuration, see the application user guide.\n",
"app.conf.update(\n",
" CELERY_TASK_RESULT_EXPIRES=3600,\n",
")\n",
"\n",
"if __name__ == '__main__':\n",
" pass#app.start()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from __future__ import absolute_import\n",
"\n",
"\n",
"@app.task\n",
"def add(x, y):\n",
" return x + y\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"app.send_task(add(3, 1000))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 15,
"text": [
"<AsyncResult: c4807752-eb28-4e0f-b8b9-fce8267bddd3>"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"celery -A proj worker --loglevel=info"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (<ipython-input-16-5806eb0c4fe2>, line 1)",
"output_type": "pyerr",
"traceback": [
"\u001b[1;36m File \u001b[1;32m\"<ipython-input-16-5806eb0c4fe2>\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m celery -A proj worker --loglevel=info\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
]
}
],
"prompt_number": 16
},
{
"cell_type": "code",
...
...
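Note on the last cell shown above: `celery -A proj worker --loglevel=info` is a shell command, not Python, which is why the notebook records an invalid-syntax traceback. A minimal sketch, not part of the commit, of launching the same worker from Python instead (it assumes a `proj` package containing the Celery app defined in the earlier cells):

    # Minimal sketch: start the Celery worker via subprocess instead of
    # pasting the shell command into a Python cell ('proj' is assumed to exist).
    import subprocess

    worker = subprocess.Popen(["celery", "-A", "proj", "worker", "--loglevel=info"])
    # ... enqueue work, e.g. add.delay(3, 1000), then shut the worker down:
    # worker.terminate()

In an IPython notebook the same command can also be run by prefixing it with `!`. Note as well that `app.send_task(add(3, 1000))` in the cell above calls `add` locally first; the usual way to enqueue the task is `add.delay(3, 1000)` or `app.send_task('proj.tasks.add', args=(3, 1000))`.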
dependances.deb
0 → 100644
gargantext_web_tutorial.ipynb
{
 "metadata": {
  "name": "",
-  "signature": "sha256:c112732dc666c365db7529d3971cad98ecc2bd84cdea3dc8613b609abf19e262"
+  "signature": "sha256:fd8cdda63e0e9cba7dbdfac864550d69bace0f3f834ebd216402a50bc10992b1"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
...
...
@@ -148,6 +148,61 @@
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from lxml import etree"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"help(etree.parse)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Help on built-in function parse in module lxml.etree:\n",
"\n",
"parse(...)\n",
" parse(source, parser=None, base_url=None)\n",
" \n",
" Return an ElementTree object loaded with source elements. If no parser\n",
" is provided as second argument, the default parser is used.\n",
" \n",
" The ``source`` can be any of the following:\n",
" \n",
" - a file name/path\n",
" - a file object\n",
" - a file-like object\n",
" - a URL using the HTTP or FTP protocol\n",
" \n",
" To parse from a string, use the ``fromstring()`` function instead.\n",
" \n",
" Note that it is generally faster to parse from a file path or URL\n",
" than from an open file object or file-like object. Transparent\n",
" decompression from gzip compressed sources is supported (unless\n",
" explicitly disabled in libxml2).\n",
" \n",
" The ``base_url`` keyword allows setting a URL for the document\n",
" when parsing from a file-like object. This is needed when looking\n",
" up external entities (DTD, XInclude, ...) with relative paths.\n",
"\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "markdown",
"metadata": {},
...
...
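The new cells above import lxml and look up `etree.parse`, presumably ahead of feeding Europresse exports pulled out of a zip archive (as sources/europresse.py does below). A minimal sketch, not part of the commit, of that combination; the archive path is illustrative and the HTML parser is an assumption for HTML-like exports:

    # Minimal sketch: parse every member of a zip archive with lxml.etree.
    # The path and the parser choice are assumptions, not taken from the code.
    import zipfile
    from lxml import etree

    with zipfile.ZipFile("/tmp/europresse_export.zip", "r") as z:
        for name in z.namelist():
            with z.open(name, "r") as member:   # file-like object, accepted by etree.parse
                tree = etree.parse(member, etree.HTMLParser())
                print(name, tree.getroot().tag)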
sources/europresse.py
...
...
@@ -31,7 +31,7 @@ from lxml import etree
 from documents.models import Document
 #from .corpus import Corpus
-class Europresse(Document):
+class Europresse():
     """
     1) First build tree to parse data
     2) Then each notice (article) is nested in a dictionary,
...
...
@@ -45,7 +45,6 @@ class Europresse(Document):
         # Specific declarations for Europresse
         self.data = []
-        self.object_ids = []
         # Encoding
         self.codif = "UTF-8"
...
...
@@ -177,11 +176,16 @@ class Europresse(Document):
                     'authors': "",
                     'section': "",
                     'page': "",
                     'text': "",
                     'object_id': ""}
                 count += 1

-    def add(self, project=None, corpus=None, user=None):
+    def add(self, project=None, corpus=None, user=None, ids=None):
         """ Appends notices to self.corpus from self.data removing duplicates"""
+        if ids is not None:
+            self.object_ids = ids
+        else:
+            self.object_ids = set()
         for i in self.data:
             if i['uniqu_id'] not in self.object_ids and isinstance(i['date'], datetime):
-                self.object_ids.append(i['uniqu_id'])
+                self.object_ids.add(i['uniqu_id'])
                 doc = Document()
                 doc.project = project
...
...
@@ -190,7 +194,6 @@ class Europresse(Document):
                 doc.date = i['date']
                 doc.uniqu_id = i['uniqu_id']
                 doc.title = i['title']
-                print(doc.project)
                 doc.source = i['source']
                 doc.authors = i['authors']
...
...
@@ -210,7 +213,6 @@ def demo():
     except Exception as e:
         print("very usefull function", e)
     for a in data.corpus:
         print(a['date'])
...
...
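The change above is the "with previous ids" part of the commit message: `add()` now receives the set of `uniqu_id` values already stored for the corpus and only keeps notices whose id is new and whose date parsed to a `datetime`. A standalone sketch, not part of the commit, of just that filter with illustrative data:

    # Minimal sketch of the dedup filter used by add(); the data is made up.
    from datetime import datetime

    existing_ids = {"abc"}                                   # ids already in the corpus
    notices = [
        {"uniqu_id": "abc", "date": datetime(2014, 1, 1)},   # duplicate, skipped
        {"uniqu_id": "xyz", "date": datetime(2014, 2, 1)},   # new, kept
        {"uniqu_id": "bad", "date": "2014"},                 # unparsed date, skipped
    ]

    kept = []
    for notice in notices:
        if notice["uniqu_id"] not in existing_ids and isinstance(notice["date"], datetime):
            existing_ids.add(notice["uniqu_id"])
            kept.append(notice)

    print([n["uniqu_id"] for n in kept])                     # ['xyz']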
sources/importateur.py
 # import Celery here
 from documents.models import Document
 from sources.europresse import Europresse
 from sources.isi import Isi
 from sources.pubmed import Pubmed
...
...
@@ -9,58 +10,65 @@ import zipfile
 def importer(source, language, zip_file, project=None, corpus=None, user=None):
+    ids = set([doc.uniqu_id for doc in Document.objects.filter(corpus=corpus)])
     if source.database == "Europresse":
         try:
             print("Europresse DB detected")
             c = Europresse()
             if zipfile.is_zipfile(zip_file):
                 with zipfile.ZipFile(zip_file, 'r') as z:
                     for fichiers in z.namelist():
                         fichier = z.open(fichiers, 'r')
                         c.parse(fichier)
-            c.add(project=project, corpus=corpus, user=user)
+            c.add(project=project, corpus=corpus, user=user, ids=ids)
         except Exception as e:
             print(e)
-    elif source.database == "Isi":
+    elif source.database == "Web of Science (ISI format)":
         try:
             print("ISI DB detected")
             c = Isi()
             if zipfile.is_zipfile(zip_file):
                 with zipfile.ZipFile(zip_file, 'r') as z:
                     for fichiers in z.namelist():
                         print("parsing %s" % (fichiers))
                         fichier = z.open(fichiers, 'r')
                         c.parse(fichier, bdd='isi')
-            c.add(project=project, corpus=corpus, user=user)
+            c.add(project=project, corpus=corpus, user=user, ids=ids)
         except Exception as e:
             print(e)
-    elif source.database == "Ris":
+    elif source.database == "RIS (Zotero)":
         try:
             print("RIS DB detected")
             c = Isi()
             if zipfile.is_zipfile(zip_file):
                 with zipfile.ZipFile(zip_file, 'r') as z:
                     for fichiers in z.namelist():
                         fichier = z.open(fichiers, 'r')
-                        c.parse(fichier)
-            c.ajouter(project=project, corpus=corpus, user=user)
+                        c.parse(fichier, bdd='ris')
+            c.add(project=project, corpus=corpus, user=user, ids=ids)
         except Exception as e:
             print(e)
     elif source.database == "Pubmed":
         try:
             print("PubMed DB detected")
             c = Pubmed()
             if zipfile.is_zipfile(zip_file):
                 with zipfile.ZipFile(zip_file, 'r') as z:
                     for fichiers in z.namelist():
                         fichier = z.open(fichiers, 'r')
                         c.parse(fichier)
-            c.ajouter(project=project, corpus=corpus, user=user)
+            c.ajouter(project=project, corpus=corpus, user=user, ids=ids)
         except Exception as e:
             print(e)
     else:
-        pass
+        print("Corpus not detected")
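Each branch of `importer()` repeats the same pattern: open the uploaded zip, feed every member to the right parser, then store the documents while skipping the ids collected at the top of the function. A standalone sketch, not part of the commit, of that shared pattern; the parser class and path below are illustrative stand-ins, not the Django-backed classes above:

    # Minimal sketch of the zip-iterate-parse-add pattern shared by every
    # branch of importer(). DummyParser and the path are made up for illustration.
    import zipfile

    class DummyParser:
        def parse(self, fileobj, bdd=None):
            print("parsing", getattr(fileobj, "name", fileobj), "as", bdd)

        def add(self, project=None, corpus=None, user=None, ids=None):
            print("adding documents, skipping", len(ids or set()), "known ids")

    def import_zip(zip_path, parser, bdd, ids):
        if zipfile.is_zipfile(zip_path):
            with zipfile.ZipFile(zip_path, "r") as z:
                for member in z.namelist():
                    with z.open(member, "r") as fichier:
                        parser.parse(fichier, bdd=bdd)
        parser.add(ids=ids)

    # import_zip("/tmp/export.zip", DummyParser(), "isi", ids=set())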
sources/isi.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """
 ISI parser.
 __author__ : alexandre+gargantext @ delanoe.org
 __licence__ : GPL version 3.0+
 __DATE__ : 2014
 __VERSION__ : 1.0
 """
 import os, sys
 #reload(sys)
 import re
 import locale
 # import hashlib ?
 from datetime import datetime, date
 from dateutil import parser
...
...
@@ -22,7 +36,6 @@ class Isi() :
         """
         # Specific declarations for Europresse
         self.data = []
-        self.object_ids = []

     def read_param(self, file):
         """
...
...
@@ -35,7 +48,7 @@ class Isi() :
         for line in lines:
             if line[0] != '#':
                 tag = line.split('\t')
-                tags[tag[1]] = [tag[0], tag[2]]
+                tags[str(tag[1])] = [str(tag[0]), str(tag[2])]
         return tags

     def rules(self, parameters):
...
...
@@ -51,12 +64,17 @@ class Isi() :
         """
         #source = open(file, 'r')
         lines = source.readlines()
-        document = {}
+        doc = {}
         if bdd == 'isi':
-            parameters = self.read_param('sources/parameters/isi.init')
+            try:
+                print("reading parameters ISI")
+                parameters = self.read_param('sources/parameters/isi.init')
+            except Exception as e:
+                print(e)
         elif bdd == 'ris':
-            parameters = self.read_param('sources/parameters/ris.init')
+            try:
+                print("reading parameters RIS")
+                parameters = self.read_param('sources/parameters/ris.init')
+            except Exception as e:
+                print(e)
         for key in list(parameters.keys()):
             if parameters[key][0] == 'BEGIN':
...
...
@@ -68,8 +86,14 @@ class Isi() :
         del parameters[end]
         for line in lines:
-            if document == {} and line[:2] == begin:
-                document['url'] = " "
+            line = str(line, encoding='UTF-8')
+            if bdd == 'ris':
+                line = line.replace(' - ', '')
+            if doc == {} and line[:2] == begin:
+                #print(line)
+                doc['url'] = " "
                 key = ""
                 result = ""
...
...
@@ -77,9 +101,9 @@ class Isi() :
                 if key != "" and key != line[:2]:
                     try:
-                        document[parameters[key][0]] = result
+                        doc[parameters[key][0]] = result
                     except Exception as e:
                         print(e)
-                    #document.setdefault(parameters[key][0],[]).append(result)
+                    #doc.setdefault(parameters[key][0],[]).append(result)
                     key = line[:2]
                     result = line[2:].strip()
...
...
@@ -89,49 +113,85 @@ class Isi() :
                         result = result + ' ' + line[2:].strip()  #.split(";")
                     except Exception as error:
-                        pass
+                        print(error)
             elif line[:2] == end:
-                document[parameters[key][0]] = result
+                doc[parameters[key][0]] = result
                 try:
                     try:
-                        date = document['year'] + " " + document['month']
-                        document['date'] = parser.parse(date)
+                        date = doc['year'] + " " + doc['month']
+                        doc['date'] = parser.parse(date)
                     except:
-                        date = document['year']
-                        document['date'] = datetime.strptime(date, '%Y')
+                        date = doc['year']
+                        doc['date'] = datetime.strptime(date, '%Y')
                 except Exception as e:
                     print('88', e)
-                self.data.append(document)
-                document = {}
+                try:
+                    print(doc['year'])
+                except Exception as e:
+                    print('58', e)
+                self.data.append(doc)
+                doc = {}

-    def add(self, project=None, corpus=None, user=None):
+    def add(self, project=None, corpus=None, user=None, ids=None):
         """ Appends notices to self.corpus from self.data removing duplicates"""
+        if ids is not None:
+            self.object_ids = ids
+        else:
+            self.object_ids = set()
         for i in self.data:
             if 'uniqu_id' not in i.keys():
                 #crypt = md5.new()
                 #crypt.update(i['title'])
                 #i['uniqu_id'] = crypt.digest()
                 i['uniqu_id'] = i['title'] + i['date']
             if i['uniqu_id'] not in self.object_ids and isinstance(i['date'], datetime):
-                self.object_ids.append(i['uniqu_id'])
+                self.object_ids.add(i['uniqu_id'])
                 doc = Document()
-                doc.project = project
-                doc.user = user
+                try:
+                    doc.project = project
+                except Exception as e:
+                    print(e)
+                try:
+                    doc.user = user
+                except Exception as e:
+                    print(e)
-                doc.date = i['date']
-                doc.uniqu_id = i['uniqu_id']
-                doc.title = i['title']
-                print(doc.project)
+                try:
+                    doc.date = i['date']
+                except Exception as e:
+                    print(e)
+                try:
+                    doc.uniqu_id = i['uniqu_id']
+                except Exception as e:
+                    print(e)
+                try:
+                    doc.title = i['title']
+                except Exception as e:
+                    print(e)
-                doc.source = i['source']
-                doc.authors = i['authors']
-                doc.text = i['text']
+                try:
+                    doc.source = i['source']
+                except Exception as e:
+                    print(e)
+                try:
+                    doc.authors = i['authors']
+                except Exception as e:
+                    print(e)
+                try:
+                    doc.abstract = i['abstract']
+                except Exception as e:
+                    print(e)
-                doc.save()
+                try:
+                    doc.save()
+                except Exception as e:
+                    print(e)
                 doc.corpus.add(corpus)
         self.data = []

 def demo():
     import sys
     data = Isi()
...
...
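The "bad date for RIS" in the commit message points at the date block above: the parser first tries `dateutil.parser.parse` on "year month" and falls back to `datetime.strptime(year, '%Y')` when the month field is missing or unparseable, which presumably leaves RIS records that carry their date in a different tag without a usable date. A standalone sketch, not part of the commit, of that fallback with illustrative values:

    # Minimal sketch of the year+month / year-only date fallback used in
    # Isi.parse(). The sample records are made up.
    from datetime import datetime
    from dateutil import parser

    def parse_record_date(doc):
        try:
            return parser.parse(doc['year'] + " " + doc['month'])
        except Exception:
            return datetime.strptime(doc['year'], '%Y')

    print(parse_record_date({'year': '2014', 'month': 'OCT'}))  # October 2014
    print(parse_record_date({'year': '2014'}))                  # 2014-01-01 00:00:00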
sources/parameters/isi.init
...
...
@@ -15,7 +15,7 @@ language LA ""
 DT DT ""
 keywords DE ;
 ID ID ;
-text AB
+abstract AB
 ISIC1 C1 \n
 reprint_author RP ,
 email EM \n
...
...
@@ -41,5 +41,5 @@ page PG ""
 field WC ""
 SC SC ""
 GA GA ""
-object_id UT ""
+uniqu_id UT ""
 END ER ""
sources/parameters/ris.init
0 → 100644
##############################################################################
# LEGEND:
# NAME (what you want[1]) FIELD (see your data) SEPARATORS (see your data)
#
# [1]
# Be careful with these variable names, which must not change:
# BEGIN, ID-unique, END
##############################################################################
BEGIN TY ""
authors AU \n
AF AF "\n"
title TI ""
source SO "\n"
language LA ""
DT DT ""
keywords KW ;
ID ID ;
abstract AB
text ST ,
ISIC1 C1 \n
reprint_author RP ,
email EM \n
thanks FX
CR CR \n
number NR \n
TC TC ""
Z9 Z9 ""
PU PU ""
PI PI ""
PA PA ""
SN SN ""
journal_small J9 ""
JI JI ""
month PD ""
year PY ""
volume VL ""
IS IS ""
BP BP ""
EP EP ""
DOI DI ""
page PG ""
field WC ""
SC SC ""
GA GA ""
uniqu_id DO ""
END ER ""
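Like isi.init, this new file is the tab-separated table that `Isi.read_param` loads: output field name, two-letter tag from the export, separator, with BEGIN/END delimiting records and `uniqu_id` feeding the deduplication in `add()`. A standalone sketch, not part of the commit, of how such a file is turned into a tag table (the content below is illustrative, not the full file):

    # Minimal sketch: load a *.init parameter file into a {tag: [name, separator]}
    # table, in the spirit of Isi.read_param. Sample content only.
    sample = (
        "# comment lines are skipped\n"
        'BEGIN\tTY\t""\n'
        'title\tTI\t""\n'
        'uniqu_id\tDO\t""\n'
        'END\tER\t""\n'
    )

    tags = {}
    for line in sample.splitlines():
        if line and line[0] != '#':
            name, field, separator = line.split('\t')
            tags[field] = [name, separator]   # keyed by the two-letter tag

    print(tags['TI'])   # ['title', '""']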