Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
842efbe4
Commit
842efbe4
authored
Nov 10, 2015
by
delanoe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] EUROPRESSE PARSER FOR HTML5. still bug with zip files.
parent
ecd6640d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
401 additions
and
265 deletions
+401
-265
tfidf.py
ngram/tfidf.py
+1
-1
EuropressFileParser.py
parsing/FileParsers/EuropressFileParser.py
+93
-264
EuropressFileParser_old.py
parsing/FileParsers/EuropressFileParser_old.py
+307
-0
No files found.
ngram/tfidf.py
View file @
842efbe4
...
@@ -238,7 +238,7 @@ def compute_tfidf_global(corpus):
...
@@ -238,7 +238,7 @@ def compute_tfidf_global(corpus):
lnD
=
log
(
D
)
lnD
=
log
(
D
)
cursor
.
execute
(
'UPDATE tmp__idf SET idf = idf +
%
f'
%
(
lnD
,
))
cursor
.
execute
(
'UPDATE tmp__idf SET idf = idf +
%
f'
%
(
lnD
,
))
# show off
# show off
dbg
.
show
(
'insert tfidf
for
%
d documents'
%
(
D
,
)
)
dbg
.
show
(
'insert tfidf
'
)
cursor
.
execute
(
'''
cursor
.
execute
(
'''
INSERT INTO
INSERT INTO
%
s (nodex_id, nodey_id, ngram_id, score)
%
s (nodex_id, nodey_id, ngram_id, score)
...
...
parsing/FileParsers/EuropressFileParser.py
View file @
842efbe4
import
re
import
re
import
locale
import
locale
from
lxml
import
etree
from
lxml
import
etree
from
lxml.etree
import
tostring
from
lxml.html
import
html5parser
from
lxml.html
import
html5parser
from
itertools
import
chain
from
datetime
import
datetime
,
date
from
datetime
import
datetime
,
date
from
django.utils
import
timezone
from
django.utils
import
timezone
import
dateutil.parser
import
dateutil.parser
import
dateparser
import
dateparser
import
sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from
.FileParser
import
FileParser
from
.FileParser
import
FileParser
#from parsing.NgramsExtractors import *
from
..NgramsExtractors
import
*
from
..NgramsExtractors
import
*
from
admin.utils
import
PrintException
from
admin.utils
import
PrintException
class
EuropressFileParser
(
FileParser
):
class
EuropressFileParser
(
FileParser
):
def
_parse
(
self
,
file
):
def
_parse
(
self
,
file
):
localeEncoding
=
"fr_FR"
localeEncoding
=
"fr_FR"
codif
=
"UTF-8"
codif
=
"UTF-8"
count
=
0
format_date
=
re
.
compile
(
'.*
\
d{4}.*'
,
re
.
UNICODE
)
if
isinstance
(
file
,
str
):
if
isinstance
(
file
,
str
):
file
=
open
(
file
,
'rb'
)
file_open
=
open
(
file
,
'rb'
)
# print(file)
contents
=
file
.
read
()
contents
=
file_open
.
read
()
#print(len(contents))
#return []
encoding
=
self
.
detect_encoding
(
contents
)
encoding
=
self
.
detect_encoding
(
contents
)
#print(encoding)
if
encoding
!=
"utf-8"
:
if
encoding
!=
"utf-8"
:
try
:
try
:
contents
=
contents
.
decode
(
"latin1"
,
errors
=
'replace'
)
.
encode
(
codif
)
contents
=
contents
.
decode
(
"latin1"
,
errors
=
'replace'
)
.
encode
(
codif
)
except
:
except
:
PrintException
()
PrintException
()
# try:
# contents = contents.decode(encoding, errors='replace').encode(codif)
# except Exception as error:
# print(error)
try
:
html_parser
=
etree
.
HTMLParser
(
encoding
=
codif
)
html
=
etree
.
fromstring
(
contents
,
html_parser
)
try
:
html_parser
=
etree
.
HTMLParser
(
encoding
=
codif
)
html
=
etree
.
fromstring
(
contents
,
html_parser
)
format_europresse
=
50
html_parser
=
html5parser
.
etree
.
HTMLParser
(
encoding
=
codif
)
html_articles
=
html
.
xpath
(
'/html/body/table/tbody'
)
html
=
html5parser
.
etree
.
fromstring
(
contents
,
html_parser
)
html_articles
=
html
.
xpath
(
'//article'
)
if
len
(
html_articles
)
<
1
:
html_articles
=
html
.
xpath
(
'/html/body/table'
)
if
len
(
html_articles
)
<
1
:
format_europresse
=
1
html_articles
=
html
.
xpath
(
'//div[@id="docContain"]'
)
if
len
(
html_articles
)
<
1
:
format_europresse
=
50.2
html_parser
=
html5parser
.
etree
.
HTMLParser
(
encoding
=
codif
)
html
=
html5parser
.
etree
.
fromstring
(
contents
,
html_parser
)
html_articles
=
html
.
xpath
(
'//article'
)
if
len
(
html_articles
)
<
1
:
print
(
"no article found"
)
except
:
PrintException
()
if
format_europresse
==
50
:
name_xpath
=
"./tr/td/span[@class = 'DocPublicationName']"
header_xpath
=
"./tr/td/span[@class = 'DocHeader']"
title_xpath
=
"string(./tr/td/span[@class = 'TitreArticleVisu'])"
text_xpath
=
"./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
elif
format_europresse
==
1
:
name_xpath
=
"//span[@class = 'DocPublicationName']"
header_xpath
=
"//span[@class = 'DocHeader']"
title_xpath
=
"string(//div[@class = 'titreArticleVisu'])"
text_xpath
=
"./descendant::*[
\
not(
\
self::div[@class='Doc-SourceText']
\
or self::span[@class='DocHeader']
\
or self::span[@class='DocPublicationName']
\
or self::span[@id='docNameVisu']
\
or self::span[@class='DocHeader']
\
or self::div[@class='titreArticleVisu']
\
or self::span[@id='docNameContType']
\
or descendant-or-self::span[@id='ucPubliC_lblCertificatIssuedTo']
\
or descendant-or-self::span[@id='ucPubliC_lblEndDate']
\
or self::td[@class='txtCertificat']
\
)]/text()"
doi_xpath
=
"//span[@id='ucPubliC_lblNodoc']/text()"
elif
format_europresse
==
50.2
:
name_xpath
=
"./header/div/span[@class = 'DocPublicationName']"
header_xpath
=
"./header/div/span[@class = 'DocHeader']"
title_xpath
=
"string(./header/div/span[@class = 'TitreArticleVisu'])"
text_xpath
=
"./header/div/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
name_xpath
=
"./header/div/span[@class = 'DocPublicationName']"
header_xpath
=
"./header/div/span[@class = 'DocHeader']"
title_xpath
=
"./header/div[@class='titreArticle']/descendant-or-self::*"
text_xpath
=
"./section/div[@class='DocText']/descendant-or-self::*"
except
Exception
as
error
:
def
paragraph_list
(
data_xpath
):
PrintException
()
result
=
list
()
for
elem
in
data_xpath
:
if
elem
.
text
is
not
None
:
if
elem
.
text
.
strip
()
!=
''
:
if
elem
.
tag
==
'p'
:
result
.
append
(
elem
.
text
)
else
:
if
len
(
result
)
>
0
:
result
.
append
(
result
.
pop
()
+
elem
.
text
)
else
:
result
.
append
(
elem
.
text
)
return
result
# parse all the articles, one by one
# parse all the articles, one by one
try
:
try
:
for
html_article
in
html_articles
:
for
html_article
in
html_articles
:
hyperdata
=
{}
hyperdata
=
{}
if
len
(
html_article
):
try
:
for
name
in
html_article
.
xpath
(
name_xpath
):
pub_name
=
html_article
.
xpath
(
name_xpath
)[
0
]
.
text
#print("test name.text")
name
=
pub_name
.
split
(
', '
)
if
name
.
text
is
not
None
:
hyperdata
[
'journal'
]
=
name
[
0
]
#print(name.text)
hyperdata
[
'number'
]
=
name
[
1
]
format_journal
=
re
.
compile
(
'(.*), (.*)'
,
re
.
UNICODE
)
except
:
test_journal
=
format_journal
.
match
(
name
.
text
)
if
test_journal
is
not
None
:
hyperdata
[
'journal'
]
=
test_journal
.
group
(
1
)
hyperdata
[
'volume'
]
=
test_journal
.
group
(
2
)
else
:
hyperdata
[
'journal'
]
=
name
.
text
.
encode
(
codif
)
countbis
=
0
for
header
in
html_article
.
xpath
(
header_xpath
):
# print(count)
# countbis += 1
# try:
# print('109', hyperdata['publication_date'])
# except:
# print('no date yet')
# pass
try
:
text
=
header
.
text
print
(
"header"
,
text
)
except
Exception
as
error
:
print
(
error
)
if
isinstance
(
text
,
bytes
):
text
=
text
.
decode
(
encoding
)
if
format_europresse
==
50.2
:
# TODO here check the split if needed: 'Brest Ville, mercredi 26 novembre 2014'
try
:
# # 2015-oct-08 exception added
text
=
text
.
split
(
', '
)[
1
]
except
:
pass
format_date_fr
=
re
.
compile
(
'
\
d*
\
s*
\
w+
\
s+
\
d{4}'
,
re
.
UNICODE
)
format_date_fr_v2
=
re
.
compile
(
'
\
s*
\
w+
\
s+
\
d+
\
s+
\
w+
\
s+
\
d{4}'
,
re
.
UNICODE
)
if
text
is
not
None
:
test_date_fr
=
format_date_fr
.
match
(
text
)
#TODO check the v2 format here
test_date_fr_v2
=
format_date_fr_v2
.
match
(
text
)
format_date_en
=
re
.
compile
(
'
\
w+
\
s+
\
d+,
\
s+
\
d{4}'
,
re
.
UNICODE
)
test_date_en
=
format_date_en
.
match
(
text
)
format_sect
=
re
.
compile
(
'(
\
D+),'
,
re
.
UNICODE
)
test_sect
=
format_sect
.
match
(
text
)
format_page
=
re
.
compile
(
', p. (
\
w+)'
,
re
.
UNICODE
)
test_page
=
format_page
.
match
(
text
)
else
:
test_date_fr
=
None
test_date_fr_v2
=
None
test_date_en
=
None
test_sect
=
None
test_page
=
None
if
test_date_fr
is
not
None
or
test_date_fr_v2
is
not
None
:
self
.
localeEncoding
=
"fr_FR"
locale
.
setlocale
(
locale
.
LC_ALL
,
"fr_FR.utf-8"
)
if
encoding
!=
"utf-8"
:
text
=
text
.
replace
(
'י'
,
'é'
)
text
=
text
.
replace
(
'ű'
,
'û'
)
text
=
text
.
replace
(
' aot '
,
' août '
)
try
:
hyperdata
[
'publication_date'
]
=
dateparser
.
parse
(
text
,
languages
=
[
'fr'
])
except
:
try
:
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
d
%
B
%
Y'
)
except
:
try
:
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
B
%
Y'
)
except
:
try
:
locale
.
setlocale
(
locale
.
LC_ALL
,
"fr_FR"
)
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
d
%
B
%
Y'
)
# hyperdata['publication_date'] = dateutil.parser.parse(text)
except
:
# TODO format to parse: ' mercredi 26 novembre 2014'
try
:
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
A
%
d
%
B
%
Y'
)
except
Exception
as
error
:
print
(
error
,
text
)
pass
if
test_date_en
is
not
None
:
localeEncoding
=
"en_GB.UTF-8"
locale
.
setlocale
(
locale
.
LC_ALL
,
localeEncoding
)
try
:
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
B
%
d,
%
Y'
)
except
:
try
:
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
B
%
Y'
)
except
:
pass
if
test_sect
is
not
None
:
hyperdata
[
'section'
]
=
test_sect
.
group
(
1
)
.
encode
(
codif
)
if
test_page
is
not
None
:
hyperdata
[
'page'
]
=
test_page
.
group
(
1
)
.
encode
(
codif
)
# try:
# print('183', hyperdata['publication_date'])
# except:
# print('no date yet')
# pass
#
hyperdata
[
'title'
]
=
html_article
.
xpath
(
title_xpath
)
.
encode
(
codif
)
hyperdata
[
'abstract'
]
=
html_article
.
xpath
(
text_xpath
)
line
=
0
br_tag
=
10
for
i
in
html_articles
[
count
]
.
iter
():
# print line, br, i, i.tag, i.attrib, i.tail
if
i
.
tag
==
"span"
:
if
"class"
in
i
.
attrib
:
if
i
.
attrib
[
'class'
]
==
'TitreArticleVisu'
:
line
=
1
br_tag
=
2
if
line
==
1
and
i
.
tag
==
"br"
:
br_tag
-=
1
if
line
==
1
and
br_tag
==
0
:
try
:
hyperdata
[
'authors'
]
=
str
.
title
(
etree
.
tostring
(
i
,
method
=
"text"
,
encoding
=
codif
))
.
encode
(
codif
)
#.split(';')
except
:
hyperdata
[
'authors'
]
=
'not found'
line
=
0
br_tag
=
10
try
:
try
:
if
hyperdata
[
'publication_date'
]
is
not
None
or
hyperdata
[
'publication_date'
]
!=
''
:
hyperdata
[
'journal'
]
=
pub_name
.
strip
()
try
:
except
:
back
=
hyperdata
[
'publication_date'
]
pass
except
Exception
as
e
:
#print(e)
pass
header
=
html_article
.
xpath
(
header_xpath
)[
0
]
.
text
else
:
if
header
is
not
None
:
try
:
header
=
header
.
split
(
', '
)
hyperdata
[
'publication_date'
]
=
back
if
format_date
.
match
(
header
[
0
]):
except
Exception
as
e
:
date
=
header
[
0
]
print
(
e
)
except
:
hyperdata
[
'publication_date'
]
=
timezone
.
now
()
#if lang == 'fr':
#hyperdata['language_iso2'] = 'fr'
#elif lang == 'en':
# hyperdata['language_iso2'] = 'en'
hyperdata
[
'publication_year'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
Y'
)
hyperdata
[
'publication_month'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
m'
)
hyperdata
[
'publication_day'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
d'
)
#hyperdata.pop('publication_date')
if
len
(
hyperdata
[
'abstract'
])
>
0
and
format_europresse
==
50
:
hyperdata
[
'doi'
]
=
str
(
hyperdata
[
'abstract'
][
-
9
])
hyperdata
[
'abstract'
]
.
pop
()
# Here add separator for paragraphs
hyperdata
[
'abstract'
]
=
str
(
' '
.
join
(
hyperdata
[
'abstract'
]))
hyperdata
[
'abstract'
]
=
str
(
re
.
sub
(
'Tous droits réservés.*$'
,
''
,
hyperdata
[
'abstract'
]))
elif
format_europresse
==
1
:
hyperdata
[
'doi'
]
=
' '
.
join
(
html_article
.
xpath
(
doi_xpath
))
hyperdata
[
'abstract'
]
=
hyperdata
[
'abstract'
][:
-
9
]
# Here add separator for paragraphs
hyperdata
[
'abstract'
]
=
str
(
' '
.
join
(
hyperdata
[
'abstract'
]))
else
:
else
:
hyperdata
[
'doi'
]
=
"not found"
hyperdata
[
'rubrique'
]
=
header
[
0
]
date
=
header
[
1
]
# try:
# hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
# except:
# PrintException()
hyperdata
[
'length_letters'
]
=
len
(
hyperdata
[
'abstract'
])
try
:
hyperdata
[
'page'
]
=
header
[
2
]
.
split
(
' '
)[
1
]
except
:
pass
try
:
hyperdata
[
'publication_date'
]
=
dateparser
.
parse
(
date
,
languages
=
[
'fr'
,
'en'
])
except
:
hyperdata
[
'publication_date'
]
=
timezone
.
now
()
try
:
title
=
paragraph_list
(
html_article
.
xpath
(
title_xpath
))
hyperdata
[
'title'
]
=
title
[
0
]
except
:
pass
try
:
text
=
paragraph_list
(
html_article
.
xpath
(
text_xpath
))
hyperdata
[
'abstract'
]
=
' '
.
join
([
' <p> '
+
p
+
' </p> '
for
p
in
title
[
1
:]
+
text
])
except
:
pass
yield
hyperdata
file_open
.
close
()
except
:
PrintException
()
pass
hyperdata
[
'bdd'
]
=
u'europresse'
if
__name__
==
"__main__"
:
hyperdata
[
'url'
]
=
u''
e
=
EuropressFileParser
()
hyperdata
=
e
.
parse
(
str
(
sys
.
argv
[
1
]))
for
h
in
hyperdata
:
try
:
print
(
h
[
'journal'
],
":"
,
h
[
'publication_date'
])
except
:
pass
#hyperdata_str = {}
for
key
,
value
in
hyperdata
.
items
():
hyperdata
[
key
]
=
value
.
decode
()
if
isinstance
(
value
,
bytes
)
else
value
yield
hyperdata
count
+=
1
file
.
close
()
except
Exception
as
error
:
print
(
error
)
pass
parsing/FileParsers/EuropressFileParser_old.py
0 → 100644
View file @
842efbe4
import
re
import
locale
from
lxml
import
etree
from
lxml.html
import
html5parser
from
datetime
import
datetime
,
date
from
django.utils
import
timezone
import
dateutil.parser
import
dateparser
from
.FileParser
import
FileParser
from
..NgramsExtractors
import
*
from
admin.utils
import
PrintException
class
EuropressFileParser
(
FileParser
):
def
_parse
(
self
,
file
):
localeEncoding
=
"fr_FR"
codif
=
"UTF-8"
count
=
0
if
isinstance
(
file
,
str
):
file
=
open
(
file
,
'rb'
)
# print(file)
contents
=
file
.
read
()
#print(len(contents))
#return []
encoding
=
self
.
detect_encoding
(
contents
)
#print(encoding)
if
encoding
!=
"utf-8"
:
try
:
contents
=
contents
.
decode
(
"latin1"
,
errors
=
'replace'
)
.
encode
(
codif
)
except
:
PrintException
()
# try:
# contents = contents.decode(encoding, errors='replace').encode(codif)
# except Exception as error:
# print(error)
try
:
html_parser
=
etree
.
HTMLParser
(
encoding
=
codif
)
html
=
etree
.
fromstring
(
contents
,
html_parser
)
try
:
format_europresse
=
50
html_articles
=
html
.
xpath
(
'/html/body/table/tbody'
)
if
len
(
html_articles
)
<
1
:
html_articles
=
html
.
xpath
(
'/html/body/table'
)
if
len
(
html_articles
)
<
1
:
format_europresse
=
1
html_articles
=
html
.
xpath
(
'//div[@id="docContain"]'
)
if
len
(
html_articles
)
<
1
:
format_europresse
=
50.2
html_parser
=
html5parser
.
etree
.
HTMLParser
(
encoding
=
codif
)
html
=
html5parser
.
etree
.
fromstring
(
contents
,
html_parser
)
html_articles
=
html
.
xpath
(
'//article'
)
if
len
(
html_articles
)
<
1
:
print
(
"no article found"
)
except
:
PrintException
()
if
format_europresse
==
50
:
name_xpath
=
"./tr/td/span[@class = 'DocPublicationName']"
header_xpath
=
"./tr/td/span[@class = 'DocHeader']"
title_xpath
=
"string(./tr/td/span[@class = 'TitreArticleVisu'])"
text_xpath
=
"./tr/td/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
elif
format_europresse
==
1
:
name_xpath
=
"//span[@class = 'DocPublicationName']"
header_xpath
=
"//span[@class = 'DocHeader']"
title_xpath
=
"string(//div[@class = 'titreArticleVisu'])"
text_xpath
=
"./descendant::*[
\
not(
\
self::div[@class='Doc-SourceText']
\
or self::span[@class='DocHeader']
\
or self::span[@class='DocPublicationName']
\
or self::span[@id='docNameVisu']
\
or self::span[@class='DocHeader']
\
or self::div[@class='titreArticleVisu']
\
or self::span[@id='docNameContType']
\
or descendant-or-self::span[@id='ucPubliC_lblCertificatIssuedTo']
\
or descendant-or-self::span[@id='ucPubliC_lblEndDate']
\
or self::td[@class='txtCertificat']
\
)]/text()"
doi_xpath
=
"//span[@id='ucPubliC_lblNodoc']/text()"
elif
format_europresse
==
50.2
:
name_xpath
=
"./header/div/span[@class = 'DocPublicationName']"
header_xpath
=
"./header/div/span[@class = 'DocHeader']"
title_xpath
=
"string(./header/div/span[@class = 'TitreArticleVisu'])"
text_xpath
=
"./header/div/descendant-or-self::*[not(self::span[@class='DocHeader'])]/text()"
except
Exception
as
error
:
PrintException
()
# parse all the articles, one by one
try
:
for
html_article
in
html_articles
:
hyperdata
=
{}
if
len
(
html_article
):
for
name
in
html_article
.
xpath
(
name_xpath
):
#print("test name.text")
if
name
.
text
is
not
None
:
#print(name.text)
format_journal
=
re
.
compile
(
'(.*), (.*)'
,
re
.
UNICODE
)
test_journal
=
format_journal
.
match
(
name
.
text
)
if
test_journal
is
not
None
:
hyperdata
[
'journal'
]
=
test_journal
.
group
(
1
)
hyperdata
[
'volume'
]
=
test_journal
.
group
(
2
)
else
:
hyperdata
[
'journal'
]
=
name
.
text
.
encode
(
codif
)
countbis
=
0
for
header
in
html_article
.
xpath
(
header_xpath
):
# print(count)
# countbis += 1
# try:
# print('109', hyperdata['publication_date'])
# except:
# print('no date yet')
# pass
try
:
text
=
header
.
text
print
(
"header"
,
text
)
except
Exception
as
error
:
print
(
error
)
if
isinstance
(
text
,
bytes
):
text
=
text
.
decode
(
encoding
)
if
format_europresse
==
50.2
:
# TODO here check the split if needed: 'Brest Ville, mercredi 26 novembre 2014'
try
:
# # 2015-oct-08 exception added
text
=
text
.
split
(
', '
)[
1
]
except
:
pass
format_date_fr
=
re
.
compile
(
'
\
d*
\
s*
\
w+
\
s+
\
d{4}'
,
re
.
UNICODE
)
format_date_fr_v2
=
re
.
compile
(
'
\
s*
\
w+
\
s+
\
d+
\
s+
\
w+
\
s+
\
d{4}'
,
re
.
UNICODE
)
if
text
is
not
None
:
test_date_fr
=
format_date_fr
.
match
(
text
)
#TODO check the v2 format here
test_date_fr_v2
=
format_date_fr_v2
.
match
(
text
)
format_date_en
=
re
.
compile
(
'
\
w+
\
s+
\
d+,
\
s+
\
d{4}'
,
re
.
UNICODE
)
test_date_en
=
format_date_en
.
match
(
text
)
format_sect
=
re
.
compile
(
'(
\
D+),'
,
re
.
UNICODE
)
test_sect
=
format_sect
.
match
(
text
)
format_page
=
re
.
compile
(
', p. (
\
w+)'
,
re
.
UNICODE
)
test_page
=
format_page
.
match
(
text
)
else
:
test_date_fr
=
None
test_date_fr_v2
=
None
test_date_en
=
None
test_sect
=
None
test_page
=
None
if
test_date_fr
is
not
None
or
test_date_fr_v2
is
not
None
:
self
.
localeEncoding
=
"fr_FR"
locale
.
setlocale
(
locale
.
LC_ALL
,
"fr_FR.utf-8"
)
if
encoding
!=
"utf-8"
:
text
=
text
.
replace
(
'י'
,
'é'
)
text
=
text
.
replace
(
'ű'
,
'û'
)
text
=
text
.
replace
(
' aot '
,
' août '
)
try
:
hyperdata
[
'publication_date'
]
=
dateparser
.
parse
(
text
,
languages
=
[
'fr'
])
except
:
try
:
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
d
%
B
%
Y'
)
except
:
try
:
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
B
%
Y'
)
except
:
try
:
locale
.
setlocale
(
locale
.
LC_ALL
,
"fr_FR"
)
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
d
%
B
%
Y'
)
# hyperdata['publication_date'] = dateutil.parser.parse(text)
except
:
# TODO format to parse: ' mercredi 26 novembre 2014'
try
:
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
A
%
d
%
B
%
Y'
)
except
Exception
as
error
:
print
(
error
,
text
)
pass
if
test_date_en
is
not
None
:
localeEncoding
=
"en_GB.UTF-8"
locale
.
setlocale
(
locale
.
LC_ALL
,
localeEncoding
)
try
:
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
B
%
d,
%
Y'
)
except
:
try
:
hyperdata
[
'publication_date'
]
=
datetime
.
strptime
(
text
,
'
%
B
%
Y'
)
except
:
pass
if
test_sect
is
not
None
:
hyperdata
[
'section'
]
=
test_sect
.
group
(
1
)
.
encode
(
codif
)
if
test_page
is
not
None
:
hyperdata
[
'page'
]
=
test_page
.
group
(
1
)
.
encode
(
codif
)
# try:
# print('183', hyperdata['publication_date'])
# except:
# print('no date yet')
# pass
#
hyperdata
[
'title'
]
=
html_article
.
xpath
(
title_xpath
)
.
encode
(
codif
)
hyperdata
[
'abstract'
]
=
html_article
.
xpath
(
text_xpath
)
line
=
0
br_tag
=
10
for
i
in
html_articles
[
count
]
.
iter
():
# print line, br, i, i.tag, i.attrib, i.tail
if
i
.
tag
==
"span"
:
if
"class"
in
i
.
attrib
:
if
i
.
attrib
[
'class'
]
==
'TitreArticleVisu'
:
line
=
1
br_tag
=
2
if
line
==
1
and
i
.
tag
==
"br"
:
br_tag
-=
1
if
line
==
1
and
br_tag
==
0
:
try
:
hyperdata
[
'authors'
]
=
str
.
title
(
etree
.
tostring
(
i
,
method
=
"text"
,
encoding
=
codif
))
.
encode
(
codif
)
#.split(';')
except
:
hyperdata
[
'authors'
]
=
'not found'
line
=
0
br_tag
=
10
try
:
if
hyperdata
[
'publication_date'
]
is
not
None
or
hyperdata
[
'publication_date'
]
!=
''
:
try
:
back
=
hyperdata
[
'publication_date'
]
except
Exception
as
e
:
#print(e)
pass
else
:
try
:
hyperdata
[
'publication_date'
]
=
back
except
Exception
as
e
:
print
(
e
)
except
:
hyperdata
[
'publication_date'
]
=
timezone
.
now
()
#if lang == 'fr':
#hyperdata['language_iso2'] = 'fr'
#elif lang == 'en':
# hyperdata['language_iso2'] = 'en'
hyperdata
[
'publication_year'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
Y'
)
hyperdata
[
'publication_month'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
m'
)
hyperdata
[
'publication_day'
]
=
hyperdata
[
'publication_date'
]
.
strftime
(
'
%
d'
)
#hyperdata.pop('publication_date')
if
len
(
hyperdata
[
'abstract'
])
>
0
and
format_europresse
==
50
:
hyperdata
[
'doi'
]
=
str
(
hyperdata
[
'abstract'
][
-
9
])
hyperdata
[
'abstract'
]
.
pop
()
# Here add separator for paragraphs
hyperdata
[
'abstract'
]
=
str
(
' '
.
join
(
hyperdata
[
'abstract'
]))
hyperdata
[
'abstract'
]
=
str
(
re
.
sub
(
'Tous droits réservés.*$'
,
''
,
hyperdata
[
'abstract'
]))
elif
format_europresse
==
1
:
hyperdata
[
'doi'
]
=
' '
.
join
(
html_article
.
xpath
(
doi_xpath
))
hyperdata
[
'abstract'
]
=
hyperdata
[
'abstract'
][:
-
9
]
# Here add separator for paragraphs
hyperdata
[
'abstract'
]
=
str
(
' '
.
join
(
hyperdata
[
'abstract'
]))
else
:
hyperdata
[
'doi'
]
=
"not found"
# try:
# hyperdata['length_words'] = len(hyperdata['abstract'].split(' '))
# except:
# PrintException()
hyperdata
[
'length_letters'
]
=
len
(
hyperdata
[
'abstract'
])
hyperdata
[
'bdd'
]
=
u'europresse'
hyperdata
[
'url'
]
=
u''
#hyperdata_str = {}
for
key
,
value
in
hyperdata
.
items
():
hyperdata
[
key
]
=
value
.
decode
()
if
isinstance
(
value
,
bytes
)
else
value
yield
hyperdata
count
+=
1
file
.
close
()
except
Exception
as
error
:
print
(
error
)
pass
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment