humanities / gargantext

Commit ba026add
Authored Dec 08, 2015 by Romain Loth
Parent: 5149e7ce

A single EuropressFileParser for both languages.

Showing 6 changed files with 158 additions and 427 deletions (+158 / -427)
gargantext_web/views_optimized.py               +2    -9
parsing/FileParsers/EuropressFileParser.py      +144  -82
parsing/FileParsers/EuropressFileParser_en.py   +0    -165   (deleted)
parsing/FileParsers/EuropressFileParser_fr.py   +0    -167   (deleted)
parsing/FileParsers/__init__.py                 +2    -2
parsing/parsers_config.py                       +10   -2
gargantext_web/views_optimized.py

...
@@ -125,21 +125,14 @@ def project(request, project_id):
         thefile = form.cleaned_data['file']
         resourcetype = cache.ResourceType[form.cleaned_data['type']]

-        # which default language shall be used?
-        if resourcetype.name == "Europress (French)":
-            language_id = cache.Language['fr'].id
-        elif resourcetype.name == "Europress (English)":
-            language_id = cache.Language['en'].id
-        else:
-            language_id = None
-
         # corpus node instantiation as a Django model
         corpus = Node(
             name        = name,
             user_id     = request.user.id,
             parent_id   = project_id,
             type_id     = cache.NodeType['Corpus'].id,
-            language_id = language_id,
+            # no default language at this point
+            language_id = None,
             hyperdata   = {'Processing' : "Parsing documents",}
         )
         session.add(corpus)
...
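The removed block shows the point of the merge: the view no longer guesses a corpus-wide language from the resource type. Instead, the parser detects each document's language itself and stores it in the document's hyperdata (see EuropressFileParser.py below). A minimal sketch of the resulting flow, using hypothetical data rather than project code:

    # Hypothetical sketch: the corpus-level language_id stays None and each
    # parsed document carries its own detected language in "language_iso2".
    def summarize_languages(parsed_docs):
        """Count the per-document languages detected by the parser."""
        counts = {}
        for hyperdata in parsed_docs:
            lang = hyperdata.get("language_iso2", "unknown")
            counts[lang] = counts.get(lang, 0) + 1
        return counts

    docs = [{"language_iso2": "fr"}, {"language_iso2": "en"}, {"language_iso2": "fr"}]
    print(summarize_languages(docs))   # {'fr': 2, 'en': 1}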
parsing/FileParsers/EuropressFileParser.py

+"""
+Parses the Europress 2015 HTML format (both English and French)
+=> recognizes the language according to the date format
+=> scrapes the text of each paragraph to fill hyperdata['abstract']
+"""
+__author__    = "Gargantext Team"
+__copyright__ = "Copyright 2014-15 ISCPIF-CNRS"
+__version__   = "0.1"
+__email__     = "romain.loth@iscpif.fr"
+__status__    = "Test"
+
 import re
 import locale
...
@@ -23,12 +34,24 @@ from ..NgramsExtractors import *
 from admin.utils import PrintException

 class EuropressFileParser(FileParser):
-    def _parse_header(self, header):
-        pass
-
     def _parse(self, file):
         localeEncoding = "fr_FR"
-        #print("europr_parser file", file)
         codif          = "UTF-8"
+        format_page    = re.compile('p\. .*', re.UNICODE)
+
+        # Europress docs in en/fr are mainly distinguished
+        # by the shape of their date
+        # ex: November 7, 2012
+        format_date_en = re.compile(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+[0-3]?\d,\s+(?:19|20)\d\d')
+        # ex: 16 mars 2011
+        format_date_fr = re.compile(r'[0-3]?\d\s+(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\s+(?:19|20)\d\d')
+
+        def parse_date(date, lang):
+            d = dateparser.parse(date.strip(), languages=[lang])
+            return d

         if isinstance(file, str):
             file = open(file, 'rb')
...
@@ -52,31 +75,47 @@ class EuropressFileParser(FileParser):
         name_xpath   = "./header/div/span[@class = 'DocPublicationName']"
         header_xpath = "./header/div/span[@class = 'DocHeader']"
-        title_xpath  = "./header/div[@class='titreArticle']/descendant-or-self::*"
-        text_xpath   = "./section/div[@class='DocText']/descendant-or-self::*"
+        title_xpath  = "./header/div[@class='titreArticle']"
+        text_xpath   = "./section/div[@class='DocText']//p"

-        def paragraph_list(data_xpath):
+        def scrap_text(data_xpath):
+            """
+            Retrieves the text of a whole subtree
+            under a list of nodes (e.g. a list of <p>)
+            and returns a list of strings
+            """
             result = list()
+            # a priori a single title, or several <p>, in data_xpath
             for elem in data_xpath:
-                if elem.text is not None:
-                    if elem.text.strip() != '':
-                        if elem.tag == 'p':
-                            result.append(elem.text)
-                        else:
-                            if len(result) > 0:
-                                result.append(result.pop() + elem.text)
-                            else:
-                                result.append(elem.text)
+                all_text = list()
+                # itertext is used to get every sub-element
+                # exactly once, whatever its depth
+                for sub_txt in elem.itertext(with_tail=True):
+                    sub_txt_clean = sub_txt.strip()
+                    if sub_txt_clean != '':
+                        all_text.append(sub_txt_clean)
+                result.append(" ".join(all_text))
             return result

         # parse all the articles, one by one
         try:
             for html_article in html_articles:
-                print('article')
+                # print("2 en 1 ==============================new article")
                 hyperdata = {}
+
+                # language analysis => useful for the date
+                # done locally so that users can choose the ResourceType
+                # "Europress" without worrying about the source language
+                doc_language = None
+
                 try:
                     pub_name = html_article.xpath(name_xpath)[0].text
                     name = pub_name.split(', ')
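scrap_text replaces the old paragraph_list and its tag-by-tag bookkeeping: itertext(with_tail=True) yields every text fragment below a node regardless of nesting, so inline markup (<b>, <a>, ...) no longer splits a paragraph. A minimal illustration of the same idea, assuming lxml is installed (not project code):

    from lxml import etree

    html = etree.fromstring("<div><p>One <b>bold</b> word.</p><p>Second.</p></div>")

    def scrap_text(data_xpath):
        # Same idea as the new helper: join every stripped text fragment
        # found below each node, whatever its depth.
        result = []
        for elem in data_xpath:
            fragments = [t.strip() for t in elem.itertext(with_tail=True) if t.strip() != '']
            result.append(" ".join(fragments))
        return result

    print(scrap_text(html.xpath("./p")))   # ['One bold word.', 'Second.']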
@@ -87,24 +126,101 @@ class EuropressFileParser(FileParser):
...
@@ -87,24 +126,101 @@ class EuropressFileParser(FileParser):
hyperdata
[
'journal'
]
=
pub_name
.
strip
()
hyperdata
[
'journal'
]
=
pub_name
.
strip
()
except
:
except
:
pass
pass
header
=
html_article
.
xpath
(
header_xpath
)[
0
]
.
text
header
=
html_article
.
xpath
(
header_xpath
)[
0
]
.
text
hyperdata
.
update
(
self
.
_parse_header
(
header
))
if
header
is
not
None
:
# Article headers in europress
hyperdata
[
'publication_year'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
Y'
)
# -----------------------------
hyperdata
[
'publication_month'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
m'
)
# ex: "Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17"
hyperdata
[
'publication_day'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
d'
)
# ex: "Votre ville, jeudi 6 février 2014"
# ex: "World, Friday, November 13, 2015"
# 1) test language before splitting
if
re
.
search
(
format_date_fr
,
header
):
doc_language
=
'fr'
# print("=============== Header date fr")
# save for FileParser
hyperdata
[
"language_iso2"
]
=
'fr'
elif
re
.
search
(
format_date_en
,
header
):
doc_language
=
'en'
# print("=============== Header date en")
# save for FileParser
hyperdata
[
"language_iso2"
]
=
'en'
else
:
print
(
"WARNING europress: echec diagnostic langue header sur '
%
s'"
%
header
)
# default value, used locally, not saved
doc_language
=
'en'
# attention en anglais la date contient 1 ou 2 virgules
# ex: "Tuesday, November 7, 2012"
# ==> dans tous ces cas 'en' dateparser.parse
# sera lancé sur header[i:] et non header[i]
header
=
header
.
split
(
', '
)
# mais dateparser ne veut pas d'éléments autres à la suite de la date
# ==> on filtre les indications de pages qu'europress met souvent après
header
=
list
(
filter
(
lambda
x
:
format_page
.
match
(
x
)
is
None
,
header
))
date
=
None
if
parse_date
(
header
[
0
],
doc_language
)
is
not
None
:
if
doc_language
==
'fr'
:
date
=
header
[
0
]
# print("match 1 fre => 0 = %s " % date)
else
:
date
=
' '
.
join
(
header
[
0
:])
# print("match 0 eng => 0: = %s " % date)
else
:
# most probably news_topic before beginning of date
hyperdata
[
'rubrique'
]
=
header
[
0
]
# [1..last_header_fragment]
for
i
in
range
(
1
,
len
(
header
)):
if
parse_date
(
header
[
i
],
doc_language
)
is
not
None
:
if
doc_language
==
'fr'
:
date
=
header
[
i
]
# print("match %i fre => %i = %s " % (i,i,date))
else
:
date
=
' '
.
join
(
header
[
i
:])
# print("match %i eng => %i: = %s " % (i,i,date))
# default
if
date
is
None
:
date
=
'2016'
# print("no match => 2016")
# we parse the retrieved datestring into a formal date
try
:
hyperdata
[
'publication_date'
]
=
dateparser
.
parse
(
date
.
strip
(),
doc_language
)
# print("RES POSTPROC:",hyperdata['publication_date'])
except
:
hyperdata
[
'publication_date'
]
=
timezone
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
try
:
try
:
title
=
paragraph_list
(
html_article
.
xpath
(
title_xpath
))
hyperdata
[
'publication_year'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
Y'
)
hyperdata
[
'publication_month'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
m'
)
hyperdata
[
'publication_day'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
d'
)
except
:
print
(
hyperdata
[
'title'
])
print
(
date
)
#print(hyperdata['publication_date'])
try
:
title
=
scrap_text
(
html_article
.
xpath
(
title_xpath
))
hyperdata
[
'title'
]
=
title
[
0
]
hyperdata
[
'title'
]
=
title
[
0
]
except
:
except
:
pass
pass
try
:
try
:
text
=
paragraph_list
(
html_article
.
xpath
(
text_xpath
))
text
=
scrap_text
(
html_article
.
xpath
(
text_xpath
))
hyperdata
[
'abstract'
]
=
' '
.
join
([
' <p> '
+
p
+
' </p> '
for
p
in
title
[
1
:]
+
text
])
hyperdata
[
'abstract'
]
=
'
\n
'
.
join
([
'<p>
\n
'
+
p_text
+
'</p>
\n
'
for
p_text
in
title
[
1
:]
+
text
])
except
:
except
:
pass
pass
...
@@ -114,60 +230,6 @@ class EuropressFileParser(FileParser):
             PrintException()
             pass

-class EuropressFileParser_fr(EuropressFileParser):
-    def _parse_header(self, header):
-        format_date = re.compile('.*\d{4}.*', re.UNICODE)
-        hyperdata = dict()
-        if header is not None:
-            header = header.split(', ')
-            if format_date.match(header[0]):
-                date = header[0]
-            elif format_date.match(header[1]):
-                hyperdata['rubrique'] = header[0]
-                date = header[1]
-                try:
-                    hyperdata['page'] = header[2].split(' ')[1]
-                except:
-                    pass
-            else:
-                date = header[2]
-        try:
-            hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
-        except:
-            hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
-        return(hyperdata)
-        #print(hyperdata['publication_date'])
-
-class EuropressFileParser_en(EuropressFileParser):
-    def _parse_header(self, header):
-        format_date = re.compile('.*\d{4}.*', re.UNICODE)
-        if header is not None:
-            header = header.split(', ')
-            if format_date.match(header[0]):
-                date = header[0]
-            elif format_date.match(header[1]):
-                hyperdata['rubrique'] = header[0]
-                date = header[1]
-                try:
-                    hyperdata['page'] = header[2].split(' ')[1]
-                except:
-                    pass
-            else:
-                date = header[2]
-        try:
-            hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr'])
-        except:
-            hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

 if __name__ == "__main__":
     e = EuropressFileParser()
     hyperdata = e.parse(str(sys.argv[1]))
...
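The per-fragment probing above leans on dateparser, which takes its language hint through the languages keyword (as in the parse_date helper). A standalone sketch of the probing on the example headers, assuming the dateparser package is installed; the page fragments ("p. 93_T_17") that format_page filters out beforehand are omitted here:

    import dateparser

    def parse_date(date, lang):
        return dateparser.parse(date.strip(), languages=[lang])

    # A French date sits in a single comma-separated fragment:
    header_fr = "Seine-Saint-Denis, lundi 28 janvier 2013".split(', ')
    print(parse_date(header_fr[1], 'fr'))              # 2013-01-28 00:00:00

    # An English date contains commas itself, so the tail is re-joined:
    header_en = "World, Friday, November 13, 2015".split(', ')
    print(parse_date(' '.join(header_en[1:]), 'en'))   # 2015-11-13 00:00:00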
parsing/FileParsers/EuropressFileParser_en.py
deleted (100644 → 0); the entire file is removed:

import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser

import sys

#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *
from admin.utils import PrintException

class EuropressFileParser_en(FileParser):
    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif          = "UTF-8"
        format_page    = re.compile('p\. .*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath   = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath  = "./header/div[@class='titreArticle']"
        text_xpath   = "./section/div[@class='DocText']//p"

        def scrap_text(data_xpath):
            """
            Retrieves the text of a whole subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori a single title, or several <p>, in data_xpath
            for elem in data_xpath:
                all_text = list()
                # itertext is used to get every sub-element
                # exactly once, whatever its depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result

        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}
                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number']  = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    # beware: in English the date contains 1 or 2 commas
                    # ex: "Tuesday, November 7, 2012"
                    # ==> in all those cases 'en' dateparser.parse
                    #     is run on header[i:] and not just header[i]
                    header = header.split(', ')
                    header = list(filter(lambda x: format_page.match(x) is None, header))
                    if parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[2:])
                    elif parse_date(header[3], 'en') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = ' '.join(header[3:])
                    else:
                        date = '2016'

                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                try:
                    hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
                    hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                    hyperdata['publication_day']   = hyperdata['publication_date'].strftime('%d')
                except:
                    print(hyperdata['title'])
                    print(date)

                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
                except:
                    pass

                yield hyperdata

        except:
            PrintException()
            pass

if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
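Both deleted files (like the merged parser) normalize non-UTF-8 exports by re-decoding them as latin1 with replacement before HTML parsing. A self-contained sketch of that step, with a hypothetical stand-in for the FileParser.detect_encoding helper (the real one is detector-based):

    codif = "UTF-8"
    contents = "16 mars 2011, café".encode("latin1")   # pretend Europress export bytes

    def detect_encoding(raw):
        # Hypothetical stand-in for FileParser.detect_encoding.
        try:
            raw.decode("utf-8")
            return "utf-8"
        except UnicodeDecodeError:
            return "latin1"

    if detect_encoding(contents) != "utf-8":
        contents = contents.decode("latin1", errors="replace").encode(codif)

    print(contents.decode(codif))   # 16 mars 2011, café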
parsing/FileParsers/EuropressFileParser_fr.py
deleted (100644 → 0); the entire file is removed:

import re
import locale

from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser

import sys

#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *
from admin.utils import PrintException

class EuropressFileParser_fr(FileParser):
    def _parse(self, file):
        localeEncoding = "fr_FR"
        codif          = "UTF-8"
        format_date    = re.compile('.*\d{4}.*', re.UNICODE)

        def parse_date(date, lang):
            d = dateparser.parse(date.strip(), languages=[lang])
            return d

        if isinstance(file, str):
            file = open(file, 'rb')

        contents = file.read()
        encoding = self.detect_encoding(contents)

        if encoding != "utf-8":
            try:
                contents = contents.decode("latin1", errors='replace').encode(codif)
            except:
                PrintException()

        html_parser = etree.HTMLParser(encoding=codif)
        html = etree.fromstring(contents, html_parser)
        html_parser = html5parser.etree.HTMLParser(encoding=codif)
        html = html5parser.etree.fromstring(contents, html_parser)
        html_articles = html.xpath('//article')

        name_xpath   = "./header/div/span[@class = 'DocPublicationName']"
        header_xpath = "./header/div/span[@class = 'DocHeader']"
        title_xpath  = "./header/div[@class='titreArticle']"
        text_xpath   = "./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"

        def scrap_text(data_xpath):
            """
            Retrieves the text of a whole subtree
            under a list of nodes (e.g. a list of <p>)
            and returns a list of strings
            """
            result = list()
            # a priori a single title, or several <p>, in data_xpath
            for elem in data_xpath:
                all_text = list()
                # itertext is used to get every sub-element
                # exactly once, whatever its depth
                for sub_txt in elem.itertext(with_tail=True):
                    sub_txt_clean = sub_txt.strip()
                    if sub_txt_clean != '':
                        all_text.append(sub_txt_clean)
                result.append(" ".join(all_text))
            return result

        # parse all the articles, one by one
        try:
            for html_article in html_articles:
                hyperdata = {}
                try:
                    pub_name = html_article.xpath(name_xpath)[0].text
                    name = pub_name.split(', ')
                    hyperdata['journal'] = name[0]
                    hyperdata['number']  = name[1]
                except:
                    try:
                        hyperdata['journal'] = pub_name.strip()
                    except:
                        pass

                header = html_article.xpath(header_xpath)[0].text
                if header is not None:
                    header = header.split(', ')
                    if parse_date(header[0], 'fr') is not None:
                        date = header[0]
                    elif parse_date(header[1], 'fr') is not None:
                        hyperdata['rubrique'] = header[0]
                        date = header[1]
                        try:
                            hyperdata['page'] = header[2].split(' ')[1]
                        except:
                            pass
                    elif parse_date(header[2], 'fr') is not None:
                        date = header[2]
                    elif parse_date(header[0], 'en') is not None:
                        date = ' '.join(header[0:])
                    elif parse_date(header[1], 'en') is not None:
                        date = ' '.join(header[1:])
                    elif parse_date(header[2], 'en') is not None:
                        date = ' '.join(header[2:])

                try:
                    hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
                except:
                    hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

                hyperdata['publication_year']  = hyperdata['publication_date'].strftime('%Y')
                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
                hyperdata['publication_day']   = hyperdata['publication_date'].strftime('%d')

                #print(hyperdata['publication_date'])

                try:
                    title = scrap_text(html_article.xpath(title_xpath))
                    hyperdata['title'] = title[0]
                except:
                    pass

                try:
                    text = scrap_text(html_article.xpath(text_xpath))
                    hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
                    # join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
                except:
                    pass

                yield hyperdata

        except:
            PrintException()
            pass

if __name__ == "__main__":
    e = EuropressFileParser()
    hyperdata = e.parse(str(sys.argv[1]))
    for h in hyperdata:
        try:
            print(h['journal'], ":", h['publication_date'])
        except:
            pass
parsing/FileParsers/__init__.py

...
@@ -3,7 +3,7 @@ from .IsiFileParser import IsiFileParser
 from .JstorFileParser import JstorFileParser
 from .ZoteroFileParser import ZoteroFileParser
 from .PubmedFileParser import PubmedFileParser
-from .EuropressFileParser_en import EuropressFileParser_en
-from .EuropressFileParser_fr import EuropressFileParser_fr
+# 2015-12-08: two-in-one parser
+from .EuropressFileParser import EuropressFileParser
 from .ISTex import ISTex
 from .CSVParser import CSVParser
parsing/parsers_config.py

+# import * via __init__.py
 from .FileParsers import *

 parsers = {
...
@@ -6,9 +7,16 @@ parsers = {
     'Scopus (RIS format)' : RisFileParser,
     'Zotero (RIS format)' : ZoteroFileParser,
     'Jstor (RIS format)'  : JstorFileParser,
-    'Europress (French)'  : EuropressFileParser_fr,
-    'Europress (English)' : EuropressFileParser_en,
+    'Europress (French)'  : EuropressFileParser,
+    'Europress (English)' : EuropressFileParser,
+
+    # A single entry could replace the French/English variants,
+    # but (TODO) consistency would first have to be checked:
+    #   - with the DB: node_resourcetype
+    #   - with admin/update_corpus.py
+    #'Europress' : EuropressFileParser,
+
     'CSVParser'           : CSVParser,
     'ISTex'               : ISTex,
 }
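Both Europress entries now point at the same class, so existing dispatch code keeps working while the language detection moved inside the parser. A tiny standalone illustration of the lookup (the class body is a stand-in, not the real parser):

    class EuropressFileParser:   # stand-in for the real parser class
        pass

    parsers = {
        'Europress (French)'  : EuropressFileParser,
        'Europress (English)' : EuropressFileParser,
    }

    # Both resource types resolve to one parser; language is detected per document.
    assert parsers['Europress (French)'] is parsers['Europress (English)']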