humanities / gargantext · Commits

Commit ba026add
authored Dec 08, 2015 by Romain Loth
A single EuropressFileParser for both languages.
parent 5149e7ce
Showing 6 changed files with 158 additions and 427 deletions (+158 -427)
gargantext_web/views_optimized.py (+2 -9)
parsing/FileParsers/EuropressFileParser.py (+144 -82)
parsing/FileParsers/EuropressFileParser_en.py (+0 -165)
parsing/FileParsers/EuropressFileParser_fr.py (+0 -167)
parsing/FileParsers/__init__.py (+2 -2)
parsing/parsers_config.py (+10 -2)
gargantext_web/views_optimized.py
...
@@ -125,21 +125,14 @@ def project(request, project_id):
         thefile = form.cleaned_data['file']
         resourcetype = cache.ResourceType[form.cleaned_data['type']]
-        # which default language shall be used?
-        if resourcetype.name == "Europress (French)":
-            language_id = cache.Language['fr'].id
-        elif resourcetype.name == "Europress (English)":
-            language_id = cache.Language['en'].id
-        else:
-            language_id = None
         # corpus node instanciation as a Django model
         corpus = Node(
             name = name,
             user_id = request.user.id,
             parent_id = project_id,
             type_id = cache.NodeType['Corpus'].id,
-            language_id = language_id,
+            # no default language at this point
+            language_id = None,
             hyperdata = {'Processing' : "Parsing documents",}
         )
         session.add(corpus)
...
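With this change the view no longer derives a default corpus language from the resource type ('fr' for Europress French, 'en' for Europress English); the corpus is created with language_id = None and language handling moves into the merged parser.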
parsing/FileParsers/EuropressFileParser.py

This diff is collapsed.
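Since the unified parser's diff is collapsed, its exact contents are not shown here. Purely as a hypothetical sketch (the function name and structure below are guesses, not the committed code), merging the two deleted variants plausibly amounts to probing the header fields in French before falling back to the English comma-separated date forms:

# Hypothetical sketch, NOT the collapsed EuropressFileParser.py:
# a merged parser could probe each header field in both languages.
def guess_header_date(header_fields, parse_date):
    """Return (lang, date_string) for the first field that parses as a date."""
    for lang in ('fr', 'en'):
        for i, field in enumerate(header_fields):
            if parse_date(field, lang) is not None:
                # English dates may span several comma-separated fields,
                # so keep everything from the matching field onwards
                return lang, ' '.join(header_fields[i:])
    return None, None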
parsing/FileParsers/EuropressFileParser_en.py
deleted 100644 → 0
import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain

from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser

import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *

from admin.utils import PrintException


class EuropressFileParser_en(FileParser):

    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        format_page = re.compile('p\. .*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath = "./header/div[@class='titreArticle']"
        text_xpath = "./section/div[@class='DocText']//p"

        def scrap_text(data_xpath):
            """
            Retrieves the text of an entire subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori a single title, or several <p>, in data_xpath
            for elem in data_xpath:
                all_text = list()
                # itertext is used to get every
                # sub-element exactly once,
                # whatever the nesting depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result

        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}

                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number'] = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    # NB: in English the date contains 1 or 2 commas,
                    # e.g. "Tuesday, November 7, 2012"
                    # ==> in all these cases 'en' dateparser.parse
                    # is run on header[i:] rather than header[i]
                    header = header.split(', ')
                    header = list(filter(lambda x: format_page.match(x) is None, header))
                    if parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[2:])
                    elif parse_date(header[3], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[3:])
                    else:
                        date = '2016'

                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                try:
                    hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
                except:
                    print(hyperdata['title'])
                    print(date)

                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
                except:
                    pass

                yield hyperdata

        except:
            PrintException()
            pass


if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
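As a side note, the date handling above leans entirely on the dateparser library (a real API; the header string below is invented for illustration):

import dateparser

# Invented example of an English Europress header, split as in _parse above
header = "Tuesday, November 7, 2012".split(', ')
# -> ['Tuesday', 'November 7', '2012']

# A lone token like 'Tuesday' can still parse as a (relative) date, which is
# why the code above re-joins header[i:] before the final parse:
print(dateparser.parse(' '.join(header), languages=['en']))
# e.g. datetime.datetime(2012, 11, 7, 0, 0)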
parsing/FileParsers/EuropressFileParser_fr.py
deleted 100644 → 0
import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain

from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser

import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *

from admin.utils import PrintException


class EuropressFileParser_fr(FileParser):

    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif = "UTF-8"
        format_date = re.compile('.*\d{4}.*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath = "./header/div[@class='titreArticle']"
        text_xpath = "./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"

        def scrap_text(data_xpath):
            """
            Retrieves the text of an entire subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori a single title, or several <p>, in data_xpath
            for elem in data_xpath:
                all_text = list()
                # itertext is used to get every
                # sub-element exactly once,
                # whatever the nesting depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result

        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}

                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number'] = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    header = header.split(', ')
                    if parse_date(header[0], 'fr') is not None:
                        date = header[0]
                    elif parse_date(header[1], 'fr') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = header[1]
                        try:
                            hyperdata['page'] = header[2].split(' ')[1]
                        except:
                            pass
                    elif parse_date(header[2], 'fr') is not None:
                        date = header[2]
                    elif parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        date = ' '.join(header[2:])

                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
                #print(hyperdata['publication_date'])

                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
                    # join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
                except:
                    pass

                yield hyperdata

        except:
            PrintException()
            pass


if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
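For comparison, the French variant expects headers shaped roughly like "rubrique, date, page"; a small invented example of the branch that also extracts hyperdata['page']:

import dateparser

# Invented French Europress header: section, date, page
header = "ECONOMIE, mercredi 3 juin 2015, p. 32".split(', ')

# header[0] is not a date but header[1] is, so the code above stores
# header[0] as 'rubrique' and header[1] as the date...
print(dateparser.parse(header[1], languages=['fr']))  # e.g. 2015-06-03 00:00:00
# ...and takes the page number from header[2]:
print(header[2].split(' ')[1])                        # '32'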
parsing/FileParsers/__init__.py
...
@@ -3,7 +3,7 @@ from .IsiFileParser import IsiFileParser
 from .JstorFileParser import JstorFileParser
 from .ZoteroFileParser import ZoteroFileParser
 from .PubmedFileParser import PubmedFileParser
-from .EuropressFileParser_en import EuropressFileParser_en
-from .EuropressFileParser_fr import EuropressFileParser_fr
+# 2015-12-08: 2-in-1 parser
+from .EuropressFileParser import EuropressFileParser
 from .ISTex import ISTex
 from .CSVParser import CSVParser
parsing/parsers_config.py
+# import * via __init__.py
 from .FileParsers import *

 parsers = {
...
@@ -6,9 +7,16 @@ parsers = {
     'Scopus (RIS format)'    : RisFileParser,
     'Zotero (RIS format)'    : ZoteroFileParser,
     'Jstor (RIS format)'     : JstorFileParser,
+    'Europress (French)'     : EuropressFileParser,
+    'Europress (English)'    : EuropressFileParser,
+    # A single entry could replace the French/English variants,
+    # but (TODO) consistency just needs to be checked:
+    # - with the DB: node_resourcetype
+    # - with admin/update_corpus.py
+    #'Europress' : EuropressFileParser,
-    'Europress (French)'     : EuropressFileParser_fr,
-    'Europress (English)'    : EuropressFileParser_en,
     'CSVParser'              : CSVParser,
     'ISTex'                  : ISTex,
 }
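The TODO above asks for a consistency check before collapsing the two entries into a single 'Europress' key. A minimal sketch of such a check (the helper name and the DB-name list are hypothetical, not part of the codebase):

# Hypothetical helper sketching the TODO's consistency check: every
# resource-type name known to the DB (node_resourcetype) should map
# to a parser registered in this dict.
def missing_parsers(parsers, db_resourcetype_names):
    return sorted(set(db_resourcetype_names) - set(parsers))

# e.g. missing_parsers(parsers, ['Europress (French)', 'Europress'])
# would flag 'Europress' until the single entry is actually added.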