humanities / gargantext · Commits

Commit 76f35de3, authored Dec 15, 2015 by delanoe
Parents: 2ab53773, 90bbffd7

[FIX] merge correction import.

Showing 4 changed files with 158 additions and 102 deletions:
annotations/static/annotations/document.js    +6   -1
annotations/templates/annotations/main.html   +10  -0
parsing/FileParsers/EuropressFileParser.py    +128 -100
parsing/NgramsExtractors/NgramsExtractor.py   +14  -1
annotations/static/annotations/document.js (view file @ 76f35de3)
@@ -2,10 +2,13 @@
 'use strict';
 
 var annotationsAppDocument = angular.module('annotationsAppDocument', ['annotationsAppHttp']);
 
 annotationsAppDocument.controller('DocController', [
     '$scope', '$rootScope', '$timeout', 'NgramListHttpService', 'DocumentHttpService',
     function ($scope, $rootScope, $timeout, NgramListHttpService, DocumentHttpService) {
+        // dataLoading = flag used to show the wait indicator
+        $scope.dataLoading = true;
+
         $rootScope.documentResource = DocumentHttpService.get(
             {'docId': $rootScope.docId},
             function(data, responseHeaders) {
@@ -27,6 +30,7 @@
             function(data) {
                 $rootScope.annotations = data[$rootScope.corpusId.toString()][$rootScope.docId.toString()];
                 $rootScope.lists = data[$rootScope.corpusId.toString()].lists;
+                $scope.dataLoading = false;
             },
             function(data) {
                 console.error("unable to get the list of ngrams");
@@ -34,6 +38,7 @@
         );
     });
+
     // TODO setup article pagination
     $scope.onPreviousClick = function () {
         DocumentHttpService.get($scope.docId - 1);
annotations/templates/annotations/main.html (view file @ 76f35de3)
@@ -86,6 +86,16 @@
 <li class="list-group-item small"><span class="badge">date</span> {[{publication_date}]}</li>
 </ul>
 </div>
+
+<div ng-if="dataLoading">
+    Loading text...
+    <br>
+    <center>
+        <img width="10%" src="{% static 'img/ajax-loader.gif'%}"></img>
+    </center>
+    <br>
+</div>
+
 <div ng-if="abstract_text != null">
     <span class="badge">abstract</span>
parsing/FileParsers/EuropressFileParser.py (view file @ 76f35de3)
@@ -39,7 +39,7 @@ class EuropressFileParser(FileParser):
     localeEncoding = "fr_FR"
     codif = "UTF-8"
+    format_page = re.compile('p\. .*', re.UNICODE)
 
     # en/fr europress docs are distinguished
     # mainly by the form of their date
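The format_page pattern recognizes the trailing page indications (e.g. "p. 93_T_17") that Europress appends after the date in its headers, so they can be dropped before date parsing. A minimal sketch of what it accepts and rejects (not part of the commit):

    import re

    format_page = re.compile('p\. .*', re.UNICODE)

    print(bool(format_page.match("p. 93_T_17")))   # True  -> page indication, filtered out
    print(bool(format_page.match("Votre ville")))  # False -> kept as a header fragment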
@@ -72,11 +72,21 @@ class EuropressFileParser(FileParser):
     html = html5parser.etree.fromstring(contents, html_parser)
     html_articles = html.xpath('//article')
 
+    # all except detail_header are mandatory to parse the article
     name_xpath = "./header/div/span[@class = 'DocPublicationName']"
-    detailed_header_xpath = "./header/div/span[@class = 'DocHeader']"
-    title_xpath = "./header/div[@class='titreArticle']"
-    text_xpath = "./section/div[@class='DocText']//p"
+    # title_xpath (normal case):
+    #    "./header/div[@class='titreArticle']"
+    # title_xpath (reports):
+    #    "./header/div/p[@class='titreArticleVisu grandTitre']"
+    #
+    # title_xpath (more generic path)
+    title_xpath = "./header//*[contains(@class,'titreArticle')]"
+    text_xpath = "./section/div[@class='DocText']//p"
+    entire_header_xpath = "./header"
+    # diagnosed during date retrieval and used for rubrique
+    detail_header_xpath = "./header/div/span[@class = 'DocHeader']"
 
     def scrap_text(data_xpath):
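The point of the more generic title_xpath is that contains(@class, 'titreArticle') is a substring test, so one expression matches both the normal layout and the report layout whose class is 'titreArticleVisu grandTitre'. A small sketch checking this (not part of the commit; assumes lxml and toy article markup):

    from lxml import etree

    title_xpath = "./header//*[contains(@class,'titreArticle')]"

    normal = etree.fromstring(
        "<article><header><div class='titreArticle'>Titre A</div></header></article>")
    report = etree.fromstring(
        "<article><header><div><p class='titreArticleVisu grandTitre'>Titre B</p></div></header></article>")

    for article in (normal, report):
        # both layouts are caught because contains() matches substrings
        print([el.text for el in article.xpath(title_xpath)])
    # ['Titre A']
    # ['Titre B']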
@@ -106,16 +116,40 @@ class EuropressFileParser(FileParser):
     try:
         for html_article in html_articles:
-            # print("2 en 1 ==============================new article")
+            # print("==============================new article")
+
+            # if there is no header at all we must skip
+            all_header = html_article.xpath(entire_header_xpath)
+            if len(all_header) == 0:
+                print("WARNING: europress (skip) article without header")
+                continue
 
             hyperdata = {}
 
+            # language analysis => useful for the date
+            # done locally to let users choose ResourceType "Europress"
+            # without worrying about the detail of the source language
+            doc_language = None
+
+            # TITLE
+            # -----
+            title = []
+            try:
+                title = scrap_text(html_article.xpath(title_xpath))
+                hyperdata['title'] = title[0]
+            except:
+                # there will be a display problem if there is no title!
+                print("WARNING: europress (skip) article without title")
+                continue
+
+            # FULLTEXT
+            # --------
+            try:
+                text = scrap_text(html_article.xpath(text_xpath))
+                hyperdata['abstract'] = '\n'.join(['<p>' + p_text + '</p>' for p_text in title[1:] + text])
+            except:
+                pass
+
+            # PUBLICATIONNAME
+            # ----------------
             try:
                 pub_name = html_article.xpath(name_xpath)[0].text
                 name = pub_name.split(', ')
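The abstract is rebuilt from every title fragment after the first plus all body paragraphs, each wrapped in a <p> tag. A quick sketch of that join with toy values (not part of the commit):

    title = ["Main headline", "Subtitle"]
    text = ["First paragraph.", "Second paragraph."]

    abstract = '\n'.join(['<p>' + p_text + '</p>' for p_text in title[1:] + text])
    print(abstract)
    # <p>Subtitle</p>
    # <p>First paragraph.</p>
    # <p>Second paragraph.</p>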
@@ -127,115 +161,109 @@ class EuropressFileParser(FileParser):
             except:
                 pass
 
-            # span of class DocHeader giving rubrique and date
-            get_dated_header = html_article.xpath(detailed_header_xpath)
-            # the detailed_header is occasionally absent
-            # => TEMPORARY FIX: we skip the document
-            if len(get_dated_header) == 0 or get_dated_header[0].text is None:
-                print("WARNING (document skip) unformatted europress header")
-                continue
-            else:
-                header = get_dated_header[0].text
-            # ex: "Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17"
-            # ex: "Votre ville, jeudi 6 février 2014"
-            # ex: "World, Friday, November 13, 2015"
-
-            # local language analysis via the date format
-            #
-            # lets one choose ResourceType "Europress" without
-            # worrying about the detail of the source language
-            doc_language = None
-            date = None
-            # 1) test language before splitting
-            if re.search(format_date_fr, header):
-                doc_language = 'fr'
-                # print("=============== Header date fr")
-                # save for FileParser
-                hyperdata["language_iso2"] = 'fr'
-            elif re.search(format_date_en, header):
-                doc_language = 'en'
-                # print("=============== Header date en")
-                # save for FileParser
-                hyperdata["language_iso2"] = 'en'
-            else:
-                print("WARNING europress: echec diagnostic langue header sur '%s'" % header)
-                # default value, used locally, not saved
-                doc_language = 'en'
-
-            # careful: in English the date contains 1 or 2 commas
-            # ex: "Tuesday, November 7, 2012"
-            # ==> in all these 'en' cases dateparser.parse
-            # will be run on header[i:] and not header[i]
-            header = header.split(', ')
-            # but dateparser refuses extra elements after the date
-            # ==> we filter out the page indications europress often appends
-            header = list(filter(lambda x: format_page.match(x) is None, header))
-            date = None
-            if parse_date(header[0], doc_language) is not None:
-                if doc_language == 'fr':
-                    date = header[0]
-                    # print("match 1 fre => 0 = %s " % date)
-                else:
-                    date = ' '.join(header[0:])
-                    # print("match 0 eng => 0: = %s " % date)
-            else:
-                # most probably news_topic before beginning of date
-                hyperdata['rubrique'] = header[0]
-                # [1..last_header_fragment]
-                for i in range(1, len(header)):
-                    if parse_date(header[i], doc_language) is not None:
-                        if doc_language == 'fr':
-                            date = header[i]
-                            # print("match %i fre => %i = %s " % (i,i,date))
-                        else:
-                            date = ' '.join(header[i:])
-                            # print("match %i eng => %i: = %s " % (i,i,date))
-            # default
-            if date is None:
-                date = '2016'
-                # print("no match => 2016")
-            # we parse the retrieved datestring into a formal date
+            # DATE and LANGUAGE
+            # ----------------
+            # Article detailed headers in europress
+            # --------------------------------------
+            # the text in which we will look for the date/language
+            search_text = None
+            # DocHeader zone giving precisely the rubrique and the date
+            detailed_text = None
+            get_detail_header = html_article.xpath(detail_header_xpath)
+            if len(get_detail_header) != 0:
+                # most common case
+                # ----------------
+                # ex: "Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17"
+                # ex: "Votre ville, jeudi 6 février 2014"
+                # ex: "World, Friday, November 13, 2015"
+                detailed_text = get_detail_header[0].text
+                search_text = detailed_text
+            else:
+                # occasionally the DocHeader is absent
+                # (we fall back on the entire header)
+                search_text = " ".join(scrap_text(all_header[0]))
+                # print("---using all header: '%s'" % search_text)
+
+            # we look for the date/language in the obtained zone
+            # 1) one REGEXP identifies the language AND captures the date
+            test_date_fr = re.search(format_date_fr, search_text)
+            if test_date_fr:
+                doc_language = 'fr'
+                # print("=============== Header date fr")
+                # save for FileParser
+                hyperdata["language_iso2"] = 'fr'
+                # match str
+                date_str = test_date_fr.group()
+            else:
+                # ex: November 7, 2012
+                test_date_en = re.search(format_date_en, search_text)
+                if test_date_en:
+                    doc_language = 'en'
+                    # print("=============== Header date en")
+                    # save for FileParser
+                    hyperdata["language_iso2"] = 'en'
+                    # match str
+                    date_str = test_date_en.group()
+                else:
+                    print("WARNING europress: echec diagnostic date/langue header sur '%s'" % header)
+                    # default lg value, used locally, not saved
+                    doc_language = 'en'
+                    # default date value, will be saved
+                    date_str = "2016"
+
+            # 2) we parse the retrieved datestring into a formal date
             try:
-                hyperdata['publication_date'] = dateparser.parse(date.strip(), doc_language)
+                hyperdata['publication_date'] = dateparser.parse(date_str.strip(), languages=[doc_language], date_formats=['%d %B %Y', '%B %d, %Y'])
                 # print("RES POSTPROC:",hyperdata['publication_date'])
             except:
                 hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
 
-            try:
-                hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
-                hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
-                hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
-            except:
-                print(hyperdata['title'])
-                print(date)
-            #print(hyperdata['publication_date'])
-
-            try:
-                title = scrap_text(html_article.xpath(title_xpath))
-                hyperdata['title'] = title[0]
-            except:
-                pass
-            try:
-                text = scrap_text(html_article.xpath(text_xpath))
-                hyperdata['abstract'] = '\n'.join(['<p>\n' + p_text + '</p>\n' for p_text in title[1:] + text])
-            except:
-                pass
+            # derived info
+            hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
+            hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
+            hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
+            #print(hyperdata['publication_date'])
+
+            # RUBRIQUE
+            # --------
+            # when we have the detailed DocHeader we can check the rubrique
+            # (if present, it comes just before the date)
+            if detailed_text is not None:
+                header_elts = detailed_text.split(', ')
+                # check that the first element is not a date or a date fragment
+                if parse_date(header_elts[0], doc_language) is None:
+                    # most probably news_topic before beginning of date
+                    hyperdata['rubrique'] = header_elts[0]
 
             yield hyperdata
 
     except:
         PrintException()
         pass
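The new parsing call pins both the language and the two date formats seen in Europress headers instead of passing the language positionally. A minimal sketch of that step (not part of the commit; assumes the dateparser package, whose parse() accepts languages and date_formats keyword arguments):

    import dateparser

    # French and English Europress header dates, as in the examples above
    for date_str, lang in [("28 janvier 2013", "fr"), ("November 13, 2015", "en")]:
        parsed = dateparser.parse(date_str.strip(),
                                  languages=[lang],
                                  date_formats=['%d %B %Y', '%B %d, %Y'])
        print(parsed)
    # 2013-01-28 00:00:00
    # 2015-11-13 00:00:00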
parsing/NgramsExtractors/NgramsExtractor.py (view file @ 76f35de3)
 # from ..Taggers import NltkTagger
 from ..Taggers import TurboTagger
 import nltk
+from re import sub
 
 """Base class for all ngrams extractors.
@@ -33,9 +34,21 @@ class NgramsExtractor:
     Returns a list of the ngrams found in the given text.
     """
     def extract_ngrams(self, contents):
-        tagged_tokens = list(self.tagger.tag_text(contents))
+        clean_contents = self._prepare_text(contents)
+        # tagging happens here
+        tagged_tokens = list(self.tagger.tag_text(clean_contents))
         if len(tagged_tokens):
             grammar_parsed = self._grammar.parse(tagged_tokens)
             for subtree in grammar_parsed.subtrees():
                 if subtree.label() == self._label:
                     yield subtree.leaves()
 
+    @staticmethod
+    def _prepare_text(text_contents):
+        """
+        Clean the text for better POS tagging
+        """
+        # strip xml tags
+        return sub(r"<[^>]{0,45}>", "", text_contents)
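A quick check of what _prepare_text's regex does (not from the commit): the {0,45} bound keeps the pattern from treating a stray '<' followed by a far-away '>' as one huge tag, while ordinary markup is removed before POS tagging.

    from re import sub

    html = "<p>Hello <b>world</b></p>"
    print(sub(r"<[^>]{0,45}>", "", html))  # -> "Hello world"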