Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
d37bdbd3
Commit
d37bdbd3
authored
May 12, 2015
by
Administrator
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'testing' into prod-dev
parents
a1b68438
407b96ab
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
97 additions
and
63 deletions
+97
-63
EuropressFileParser.py
parsing/FileParsers/EuropressFileParser.py
+36
-36
FileParser.py
parsing/FileParsers/FileParser.py
+14
-12
RisFileParser.py
parsing/FileParsers/RisFileParser.py
+12
-4
ZoteroFileParser.py
parsing/FileParsers/ZoteroFileParser.py
+23
-0
__init__.py
parsing/FileParsers/__init__.py
+1
-0
corpustools.py
parsing/corpustools.py
+9
-9
parsers_config.py
parsing/parsers_config.py
+2
-2
No files found.
parsing/FileParsers/EuropressFileParser.py
View file @
d37bdbd3
...
...
@@ -11,7 +11,7 @@ from ..NgramsExtractors import *
from
admin.utils
import
PrintException
class
EuropressFileParser
(
FileParser
):
def
_parse
(
self
,
file
):
localeEncoding
=
"fr_FR"
...
...
@@ -39,21 +39,21 @@ class EuropressFileParser(FileParser):
try
:
html_parser
=
etree
.
HTMLParser
(
encoding
=
codif
)
html
=
etree
.
fromstring
(
contents
,
html_parser
)
try
:
format_europresse
=
50
html_articles
=
html
.
xpath
(
'/html/body/table/tbody'
)
if
len
(
html_articles
)
<
1
:
html_articles
=
html
.
xpath
(
'/html/body/table'
)
if
len
(
html_articles
)
<
1
:
format_europresse
=
1
html_articles
=
html
.
xpath
(
'//div[@id="docContain"]'
)
except
:
PrintException
()
if
format_europresse
==
50
:
name_xpath
=
"./tr/td/span[@class = 'DocPublicationName']"
header_xpath
=
"./tr/td/span[@class = 'DocHeader']"
...
...
@@ -77,7 +77,7 @@ class EuropressFileParser(FileParser):
or self::td[@class='txtCertificat']
\
)]/text()"
doi_xpath
=
"//span[@id='ucPubliC_lblNodoc']/text()"
except
Exception
as
error
:
PrintException
()
...
...
@@ -85,9 +85,9 @@ class EuropressFileParser(FileParser):
# parse all the articles, one by one
try
:
for
html_article
in
html_articles
:
hyperdata
=
{}
if
len
(
html_article
):
for
name
in
html_article
.
xpath
(
name_xpath
):
if
name
.
text
is
not
None
:
...
...
@@ -98,26 +98,26 @@ class EuropressFileParser(FileParser):
hyperdata
[
'volume'
]
=
test_journal
.
group
(
2
)
else
:
hyperdata
[
'journal'
]
=
name
.
text
.
encode
(
codif
)
countbis
=
0
for
header
in
html_article
.
xpath
(
header_xpath
):
# print(count)
# countbis += 1
# try:
# print('109', hyperdata['publication_date'])
# except:
# print('no date yet')
# pass
try
:
text
=
header
.
text
#print("header", text)
except
Exception
as
error
:
print
(
error
)
if
isinstance
(
text
,
bytes
):
text
=
text
.
decode
(
encoding
)
format_date_fr
=
re
.
compile
(
'
\
d*
\
s*
\
w+
\
s+
\
d{4}'
,
re
.
UNICODE
)
...
...
@@ -134,9 +134,9 @@ class EuropressFileParser(FileParser):
test_date_en
=
None
test_sect
=
None
test_page
=
None
if
test_date_fr
is
not
None
:
self
.
localeEncoding
=
"fr_FR"
locale
.
setlocale
(
locale
.
LC_ALL
,
localeEncoding
)
...
...
@@ -158,7 +158,7 @@ class EuropressFileParser(FileParser):
except
Exception
as
error
:
print
(
error
,
text
)
pass
if
test_date_en
is
not
None
:
localeEncoding
=
"en_GB.UTF-8"
...
...
@@ -173,20 +173,20 @@ class EuropressFileParser(FileParser):
if
test_sect
is
not
None
:
hyperdata
[
'section'
]
=
test_sect
.
group
(
1
)
.
encode
(
codif
)
if
test_page
is
not
None
:
hyperdata
[
'page'
]
=
test_page
.
group
(
1
)
.
encode
(
codif
)
try
:
print
(
'183'
,
hyperdata
[
'publication_date'
])
except
:
print
(
'no date yet'
)
pass
#
try:
#
print('183', hyperdata['publication_date'])
#
except:
#
print('no date yet')
#
pass
#
hyperdata
[
'title'
]
=
html_article
.
xpath
(
title_xpath
)
.
encode
(
codif
)
hyperdata
[
'abstract'
]
=
html_article
.
xpath
(
text_xpath
)
line
=
0
br_tag
=
10
for
i
in
html_articles
[
count
]
.
iter
():
...
...
@@ -205,13 +205,13 @@ class EuropressFileParser(FileParser):
hyperdata
[
'authors'
]
=
'not found'
line
=
0
br_tag
=
10
try
:
if
hyperdata
[
'publication_date'
]
is
not
None
or
hyperdata
[
'publication_date'
]
!=
''
:
try
:
back
=
hyperdata
[
'publication_date'
]
except
Exception
as
e
:
except
Exception
as
e
:
#print(e)
pass
else
:
...
...
@@ -226,14 +226,14 @@ class EuropressFileParser(FileParser):
#hyperdata['language_iso2'] = 'fr'
#elif lang == 'en':
# hyperdata['language_iso2'] = 'en'
hyperdata
[
'publication_year'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
Y'
)
hyperdata
[
'publication_month'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
m'
)
hyperdata
[
'publication_day'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
d'
)
#hyperdata.pop('publication_date')
if
len
(
hyperdata
[
'abstract'
])
>
0
and
format_europresse
==
50
:
if
len
(
hyperdata
[
'abstract'
])
>
0
and
format_europresse
==
50
:
hyperdata
[
'doi'
]
=
str
(
hyperdata
[
'abstract'
][
-
9
])
hyperdata
[
'abstract'
]
.
pop
()
# Here add separator for paragraphs
...
...
@@ -245,15 +245,15 @@ class EuropressFileParser(FileParser):
# Here add separator for paragraphs
hyperdata
[
'abstract'
]
=
str
(
' '
.
join
(
hyperdata
[
'abstract'
]))
else
:
else
:
hyperdata
[
'doi'
]
=
"not found"
hyperdata
[
'length_words'
]
=
len
(
hyperdata
[
'abstract'
]
.
split
(
' '
))
hyperdata
[
'length_letters'
]
=
len
(
hyperdata
[
'abstract'
])
hyperdata
[
'bdd'
]
=
u'europresse'
hyperdata
[
'url'
]
=
u''
#hyperdata_str = {}
for
key
,
value
in
hyperdata
.
items
():
hyperdata
[
key
]
=
value
.
decode
()
if
isinstance
(
value
,
bytes
)
else
value
...
...
parsing/FileParsers/FileParser.py
View file @
d37bdbd3
...
...
@@ -4,21 +4,21 @@ import zipfile
import
chardet
from
..Caches
import
LanguagesCache
class
FileParser
:
"""Base class for performing files parsing depending on their type.
"""
def
__init__
(
self
,
language_cache
=
None
):
self
.
_languages_cache
=
LanguagesCache
()
if
language_cache
is
None
else
language_cache
def detect_encoding(self, string):
    """Detect the encoding of a document.

    Runs ``chardet.detect`` on ``string`` and returns the reported
    encoding name, falling back to ``'UTF-8'`` when none is reported.
    """
    detected = chardet.detect(string)
    # chardet always includes the 'encoding' key and sets it to None when
    # it cannot decide, so a plain ``.get('encoding', 'UTF-8')`` default
    # would never apply; fall back explicitly on any falsy value.
    return detected.get('encoding') or 'UTF-8'
def
format_hyperdata_dates
(
self
,
hyperdata
):
"""Format the dates found in the hyperdata.
Examples:
...
...
@@ -27,7 +27,7 @@ class FileParser:
{"publication_year": "2014"}
-> {"publication_date": "2014-01-01 00:00:00", "publication_year": "2014", ...}
"""
# First, check the split dates...
prefixes
=
[
key
[:
-
5
]
for
key
in
hyperdata
.
keys
()
if
key
[
-
5
:]
==
"_year"
]
for
prefix
in
prefixes
:
...
...
@@ -51,21 +51,23 @@ class FileParser:
hyperdata
[
prefix
+
"_date"
]
=
dateutil
.
parser
.
parse
(
date_string
)
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
except
:
pass
# ...then parse all the "date" fields, to parse it into separate elements
prefixes
=
[
key
[:
-
5
]
for
key
in
hyperdata
.
keys
()
if
key
[
-
5
:]
==
"_date"
]
for
prefix
in
prefixes
:
date
=
dateutil
.
parser
.
parse
(
hyperdata
[
prefix
+
"_date"
])
print
(
'date'
)
hyperdata
[
prefix
+
"_year"
]
=
date
.
strftime
(
"
%
Y"
)
hyperdata
[
prefix
+
"_month"
]
=
date
.
strftime
(
"
%
m"
)
hyperdata
[
prefix
+
"_day"
]
=
date
.
strftime
(
"
%
d"
)
hyperdata
[
prefix
+
"_hour"
]
=
date
.
strftime
(
"
%
H"
)
hyperdata
[
prefix
+
"_minute"
]
=
date
.
strftime
(
"
%
M"
)
hyperdata
[
prefix
+
"_second"
]
=
date
.
strftime
(
"
%
S"
)
# finally, return the transformed result!
return
hyperdata
def
format_hyperdata_languages
(
self
,
hyperdata
):
"""format the languages found in the hyperdata."""
language
=
None
...
...
@@ -81,18 +83,18 @@ class FileParser:
hyperdata
[
"language_iso3"
]
=
language
.
iso3
hyperdata
[
"language_fullname"
]
=
language
.
fullname
return
hyperdata
def format_hyperdata(self, hyperdata):
    """Run the date formatter, then the language formatter, on hyperdata.

    Returns whatever ``format_hyperdata_languages`` returns when given the
    output of ``format_hyperdata_dates``.
    """
    with_dates = self.format_hyperdata_dates(hyperdata)
    return self.format_hyperdata_languages(with_dates)
def
_parse
(
self
,
file
):
"""This method shall be overriden by inherited classes."""
return
list
()
def
parse
(
self
,
file
):
"""Parse the file, and its children files found in the file.
"""
...
...
parsing/FileParsers/RisFileParser.py
View file @
d37bdbd3
...
...
@@ -3,15 +3,17 @@ from .FileParser import FileParser
from
..Caches
import
LanguagesCache
from
admin.utils
import
PrintException
class
RisFileParser
(
FileParser
):
def
__init__
(
self
,
language_cache
=
None
):
super
(
FileParser
,
self
)
.
__init__
()
self
.
_languages_cache
=
LanguagesCache
()
if
language_cache
is
None
else
language_cache
self
.
_begin
=
6
self
.
_parameters
=
{
b
"ER"
:
{
"type"
:
"delimiter"
},
b
"TI"
:
{
"type"
:
"hyperdata"
,
"key"
:
"title"
,
"separator"
:
" "
},
...
...
@@ -24,7 +26,7 @@ class RisFileParser(FileParser):
b
"AB"
:
{
"type"
:
"hyperdata"
,
"key"
:
"abstract"
,
"separator"
:
" "
},
b
"WC"
:
{
"type"
:
"hyperdata"
,
"key"
:
"fields"
},
}
def
_parse
(
self
,
file
):
hyperdata
=
{}
...
...
@@ -57,5 +59,11 @@ class RisFileParser(FileParser):
print
(
error
)
# if a hyperdata object is left in memory, yield it as well
if
hyperdata
:
# try:
# if hyperdata['date_to_parse']:
# print(hyperdata['date_to_parse'])
# except:
# pass
#
#print(hyperdata['title'])
yield
hyperdata
parsing/FileParsers/ZoteroFileParser.py
0 → 100644
View file @
d37bdbd3
from
.RisFileParser
import
RisFileParser
from
..Caches
import
LanguagesCache
class ZoteroFileParser(RisFileParser):
    """RIS file parser configured with the tag table used for Zotero exports.

    The parsing logic itself is inherited from ``RisFileParser``; this class
    only supplies its own ``_begin`` offset and ``_parameters`` tag table.
    """

    def __init__(self, language_cache=None):
        """Initialise the parser and install the Zotero tag table.

        ``language_cache`` is optional and is forwarded to the parent
        initialiser, matching the signature of ``FileParser`` and
        ``RisFileParser``. Calling ``ZoteroFileParser()`` with no argument
        keeps working as before.
        """
        # Initialise through the direct parent. The previous
        # ``super(RisFileParser, self)`` call skipped RisFileParser.__init__
        # entirely, so any state it sets up was never applied here.
        super(ZoteroFileParser, self).__init__(language_cache)

        # NOTE(review): presumably the offset at which a line's value
        # starts (after the tag prefix) -- confirm against
        # RisFileParser._parse.
        self._begin = 6

        # Maps each two-letter tag (as bytes) to how its value is handled.
        # Entries of type "hyperdata" name the hyperdata key to fill;
        # "separator", when present, is presumably used to join repeated
        # values -- confirm against RisFileParser._parse.
        self._parameters = {
            b"ER": {"type": "delimiter"},
            b"TI": {"type": "hyperdata", "key": "title", "separator": " "},
            b"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
            b"UR": {"type": "hyperdata", "key": "doi"},
            b"DA": {"type": "hyperdata", "key": "publication_date"},
            b"PY": {"type": "hyperdata", "key": "publication_year"},
            b"PD": {"type": "hyperdata", "key": "publication_month"},
            b"LA": {"type": "hyperdata", "key": "language_iso2"},
            b"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
            b"WC": {"type": "hyperdata", "key": "fields"},
        }
parsing/FileParsers/__init__.py
View file @
d37bdbd3
from
.RisFileParser
import
RisFileParser
from
.IsiFileParser
import
IsiFileParser
from
.JstorFileParser
import
JstorFileParser
from
.ZoteroFileParser
import
ZoteroFileParser
from
.PubmedFileParser
import
PubmedFileParser
from
.EuropressFileParser
import
EuropressFileParser
from
.ISText
import
ISText
parsing/corpustools.py
View file @
d37bdbd3
...
...
@@ -128,7 +128,7 @@ def parse_resources(corpus, user=None, user_id=None):
nodes
.
append
(
node
)
#
# TODO: mark node-resources associations as parsed
#
#
dbg
.
show
(
'insert
%
d documents'
%
len
(
nodes
))
session
.
add_all
(
nodes
)
session
.
commit
()
...
...
@@ -205,7 +205,7 @@ def extract_ngrams(corpus, keys):
language
.
id
:
language
.
iso2
for
language
in
session
.
query
(
Language
)
}
ngrams_data
=
set
()
ngrams_language_data
=
set
()
ngrams_tag_data
=
set
()
...
...
@@ -241,7 +241,7 @@ def extract_ngrams(corpus, keys):
#tag_id = 14
#print('tag_id_2', tag_id)
node_ngram_list
[
node_id
][
terms
]
+=
1
ngrams_data
.
add
((
n
,
terms
))
ngrams_data
.
add
((
n
,
terms
[:
255
]
))
ngrams_language_data
.
add
((
terms
,
language_id
))
ngrams_tag_data
.
add
((
terms
,
tag_id
))
...
...
@@ -268,7 +268,7 @@ def extract_ngrams(corpus, keys):
ngram.terms = tmp__ngrams.terms
'''
%
(
Ngram
.
__table__
.
name
,
))
# insert, then get the ids back
cursor
.
execute
(
'''
INSERT INTO
%
s (n, terms)
...
...
@@ -279,8 +279,8 @@ def extract_ngrams(corpus, keys):
WHERE
id IS NULL
'''
%
(
Ngram
.
__table__
.
name
,
))
cursor
.
execute
(
'''
UPDATE
tmp__ngrams
...
...
@@ -293,14 +293,14 @@ def extract_ngrams(corpus, keys):
AND
tmp__ngrams.id IS NULL
'''
%
(
Ngram
.
__table__
.
name
,
))
# get all ids
ngram_ids
=
dict
()
cursor
.
execute
(
'SELECT id, terms FROM tmp__ngrams'
)
for
row
in
cursor
.
fetchall
():
ngram_ids
[
row
[
1
]]
=
row
[
0
]
#
#
dbg
.
show
(
'insert associations'
)
node_ngram_data
=
list
()
for
node_id
,
ngrams
in
node_ngram_list
.
items
():
...
...
parsing/parsers_config.py
View file @
d37bdbd3
...
...
@@ -4,11 +4,11 @@ parsers = {
'Pubmed (xml format)'
:
PubmedFileParser
,
'Web of Science (ISI format)'
:
IsiFileParser
,
'Scopus (RIS format)'
:
RisFileParser
,
'Zotero (RIS format)'
:
Jstor
FileParser
,
'Zotero (RIS format)'
:
Zotero
FileParser
,
'Jstor (RIS format)'
:
JstorFileParser
,
#'Europress' : EuropressFileParser,
'Europress (French)'
:
EuropressFileParser
,
'Europress (English)'
:
EuropressFileParser
,
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment