Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
fb8ca4ec
Commit
fb8ca4ec
authored
Sep 05, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
parsers: fix europresse exceptions handling
parent
e6ca0150
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
30 additions
and
14 deletions
+30
-14
EUROPRESSE.py
gargantext/util/parsers/EUROPRESSE.py
+30
-14
No files found.
gargantext/util/parsers/EUROPRESSE.py
View file @
fb8ca4ec
...
...
@@ -4,8 +4,8 @@
"""
__author__
=
"Gargantext Team"
__copyright__
=
"Copyright 2014-1
5
ISCPIF-CNRS"
__version__
=
"0.
1
"
__copyright__
=
"Copyright 2014-1
6
ISCPIF-CNRS"
__version__
=
"0.
2
"
__email__
=
"romain.loth@iscpif.fr"
__status__
=
"Test"
...
...
@@ -27,6 +27,7 @@ import sys
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from
._Parser
import
Parser
from
traceback
import
format_tb
class
EuropresseParser
(
Parser
):
...
...
@@ -112,15 +113,17 @@ class EuropresseParser(Parser):
# parse all the articles, one by one
try
:
for
html_article
in
html_articles
:
try
:
print
(
"==============================new article"
)
# s'il n'y a pas du tout de header on doit skip
all_header
=
html_article
.
xpath
(
entire_header_xpath
)
if
len
(
all_header
)
==
0
:
print
(
"WARNING: europress (skip) article without header"
)
all_header_text
=
" "
.
join
(
scrap_text
(
all_header
))
if
len
(
all_header
)
==
0
or
len
(
all_header_text
)
==
0
:
hyperdata
[
'error'
]
=
"Europresse: html doc with no header"
yield
(
hyperdata
)
print
(
"WARNING: europresse (skip) article without header"
)
continue
hyperdata
=
{}
...
...
@@ -134,7 +137,9 @@ class EuropresseParser(Parser):
hyperdata
[
'title'
]
=
title
[
0
]
except
:
# il y aura un problème d'affichage si pas de titre !
print
(
"WARNING: europress (skip) article without title"
)
print
(
"WARNING: europresse (skip) article without title"
)
hyperdata
[
'error'
]
=
"Europresse: doc with no title"
yield
(
hyperdata
)
continue
...
...
@@ -189,12 +194,19 @@ class EuropresseParser(Parser):
else
:
# occasionellment DocHeader absent
# (on se rabat sur le header entier)
search_text
=
" "
.
join
(
scrap_text
(
all_header
[
0
]))
search_text
=
all_header_text
# print("---using all header: '%s'" % search_text)
# si on n'a pas trouvé de zone du tout
if
not
search_text
:
the_err
=
"europresse (skip) doc without detailed header"
print
(
"WARNING:"
+
the_err
)
hyperdata
[
'error'
]
=
the_err
yield
(
hyperdata
)
continue
# on poursuit date/langue avec la zone obtenue
# on poursuit date/langue avec la zone obtenue
...
# 1) Une REGEXP identifie la langue ET attrape la date
test_date_fr
=
re
.
search
(
format_date_fr
,
search_text
)
...
...
@@ -223,7 +235,7 @@ class EuropresseParser(Parser):
# match str
date_str
=
test_date_en
.
group
()
else
:
print
(
"WARNING europress: echec diagnostic date/langue header sur '
%
s'"
%
header
)
print
(
"WARNING europress
e
: echec diagnostic date/langue header sur '
%
s'"
%
header
)
# default lg value, used locally, not saved
doc_language
=
'en'
# default date value, will be saved
...
...
@@ -260,8 +272,12 @@ class EuropresseParser(Parser):
# most probably news_topic before beginning of date
hyperdata
[
'rubrique'
]
=
header_elts
[
0
]
print
(
hyperdata
)
#
print(hyperdata)
yield
hyperdata
except
Exception
as
err
:
print
(
'Europresse parser: Something bad happened:'
+
str
(
err
))
print
(
'WARNING: europresse (skip) unknown error:"'
+
str
(
err
)
+
'"'
+
"
\n
>>>"
+
(
">>>"
.
join
(
format_tb
(
err
.
__traceback__
))))
hyperdata
[
'error'
]
=
err
yield
(
hyperdata
)
continue
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment