Commit 76f35de3 authored by delanoe

[FIX] merge correction import.

parents 2ab53773 90bbffd7
...@@ -2,10 +2,13 @@
'use strict';
var annotationsAppDocument = angular.module('annotationsAppDocument', ['annotationsAppHttp']);
annotationsAppDocument.controller('DocController',
['$scope', '$rootScope', '$timeout', 'NgramListHttpService', 'DocumentHttpService',
function ($scope, $rootScope, $timeout, NgramListHttpService, DocumentHttpService) {
// dataLoading = flag used to show the wait indicator
$scope.dataLoading = true ;
$rootScope.documentResource = DocumentHttpService.get(
{'docId': $rootScope.docId},
function(data, responseHeaders) {
...@@ -27,6 +30,7 @@
function(data) {
$rootScope.annotations = data[$rootScope.corpusId.toString()][$rootScope.docId.toString()];
$rootScope.lists = data[$rootScope.corpusId.toString()].lists;
$scope.dataLoading = false ;
},
function(data) {
console.error("unable to get the list of ngrams");
...@@ -34,6 +38,7 @@
);
});
// TODO setup article pagination
$scope.onPreviousClick = function () {
DocumentHttpService.get($scope.docId - 1);
......
...@@ -86,6 +86,16 @@
<li class="list-group-item small"><span class="badge">date</span>{[{publication_date}]}</li>
</ul>
</div>
<div ng-if="dataLoading">
Loading text...
<br>
<center>
<img width="10%" src="{% static 'img/ajax-loader.gif'%}"></img>
</center>
<br>
</div>
<div ng-if="abstract_text != null"> <div ng-if="abstract_text != null">
<span class="badge">abstract</span> <span class="badge">abstract</span>
</div> </div>
......
...@@ -39,7 +39,7 @@ class EuropressFileParser(FileParser):
localeEncoding = "fr_FR"
codif = "UTF-8"
format_page = re.compile('p\. .*', re.UNICODE)
# the en/fr europresse docs
# are mainly distinguished
# by the form of their date
...@@ -72,11 +72,21 @@ class EuropressFileParser(FileParser):
html = html5parser.etree.fromstring(contents, html_parser)
html_articles = html.xpath('//article')
# all except detail_header are mandatory to parse the article
name_xpath = "./header/div/span[@class = 'DocPublicationName']"
detailed_header_xpath = "./header/div/span[@class = 'DocHeader']"
# title_xpath (normal case):
#   "./header/div[@class='titreArticle']"
# title_xpath (reports):
#   "./header/div/p[@class='titreArticleVisu grandTitre']"
#
# title_xpath (more generic path)
title_xpath = "./header//*[contains(@class,'titreArticle')]"
text_xpath = "./section/div[@class='DocText']//p"
entire_header_xpath = "./header"
# diagnosed during date retrieval and used for rubrique
detail_header_xpath = "./header/div/span[@class = 'DocHeader']"
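# --- illustration only (not part of the commit): a minimal sketch of how the selectors
# --- above behave, assuming lxml and a hypothetical one-article HTML string; the real
# --- parser builds html_articles the same way from the uploaded Europress export
from lxml import etree
demo_html = ("<html><body><article><header><div>"
"<span class='DocHeader'>Votre ville, jeudi 6 février 2014</span>"
"<div class='titreArticle'>Un titre</div>"
"</div></header>"
"<section><div class='DocText'><p>Texte.</p></div></section>"
"</article></body></html>")
demo_article = etree.fromstring(demo_html, etree.HTMLParser()).xpath('//article')[0]
demo_article.xpath(title_xpath)[0].text          # -> 'Un titre'
demo_article.xpath(detail_header_xpath)[0].text  # -> 'Votre ville, jeudi 6 février 2014'
demo_article.xpath(text_xpath)[0].text           # -> 'Texte.'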
def scrap_text(data_xpath):
...@@ -106,16 +116,40 @@ class EuropressFileParser(FileParser):
try:
for html_article in html_articles:
# print("2 en 1 ==============================new article") # print("==============================new article")
# if there is no header at all we must skip the article
all_header = html_article.xpath(entire_header_xpath)
if len(all_header) == 0:
print("WARNING: europress (skip) article without header")
continue
hyperdata = {}
# language analysis => useful for the date;
# done locally so that users can choose the
# ResourceType "Europress" without worrying
# about the details of the source language
doc_language = None
# TITLE
# -----
title = []
try:
title = scrap_text(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
except:
# there will be a display problem if there is no title!
print("WARNING: europress (skip) article without title")
continue
# FULLTEXT
# --------
try:
text = scrap_text(html_article.xpath(text_xpath))
hyperdata['abstract'] = '\n'.join([ '<p>'+p_text+'</p>' for p_text in title[1:] + text])
except:
pass
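# --- illustration only (not part of the commit): the join above with made-up values,
# --- showing that any extra title fragments are kept as leading paragraphs of the abstract
demo_title = ["Main title", "Subtitle kept as body"]
demo_text = ["First paragraph.", "Second paragraph."]
'\n'.join([ '<p>'+p_text+'</p>' for p_text in demo_title[1:] + demo_text])
# -> '<p>Subtitle kept as body</p>\n<p>First paragraph.</p>\n<p>Second paragraph.</p>'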
# PUBLICATIONNAME
# ----------------
try:
pub_name = html_article.xpath(name_xpath)[0].text
name = pub_name.split(', ')
...@@ -127,115 +161,109 @@ class EuropressFileParser(FileParser):
except:
pass
# DATE and LANGUAGE
# -----------------
# local language analysis based on the date format;
# lets users choose the ResourceType "Europress"
# without worrying about the source language
doc_language = None

# the text in which we will look for the date/language
search_text = None
# DocHeader zone giving precisely the rubrique and the date
detailed_text = None
get_detail_header = html_article.xpath(detail_header_xpath)
if len(get_detail_header) != 0:
# most common case
# ----------------
# ex: "Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17"
# ex: "Votre ville, jeudi 6 février 2014"
# ex: "World, Friday, November 13, 2015"
detailed_text = get_detail_header[0].text
search_text = detailed_text
else:
# the DocHeader is occasionally absent
# (we fall back on the entire header)
search_text = " ".join(scrap_text(all_header[0]))
# print("---using all header: '%s'" % search_text)

# we carry on with date/language detection on the zone obtained
# 1) a REGEXP identifies the language AND captures the date
test_date_fr = re.search(format_date_fr,search_text)
if test_date_fr:
doc_language = 'fr'
# print("=============== Header date fr")
# save for FileParser
hyperdata["language_iso2"] = 'fr'
# match str
date_str = test_date_fr.group()
else:
# ex: November 7, 2012
test_date_en = re.search(format_date_en,search_text)
if test_date_en:
doc_language = 'en'
# print("=============== Header date en")
# save for FileParser
hyperdata["language_iso2"] = 'en'
# match str
date_str = test_date_en.group()
else:
print("WARNING europress: date/language diagnosis failed on header '%s'" % search_text)
# default lg value, used locally, not saved
doc_language = 'en'
# default date value, will be saved
date_str = "2016"

# 2) we parse the retrieved datestring into a formal date
try:
hyperdata['publication_date'] = dateparser.parse(
date_str.strip(),
languages=[doc_language],
date_formats=['%d %B %Y','%B %d, %Y']
)
# print("RES POSTPROC:",hyperdata['publication_date'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")

# derived fields
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')

# RUBRIQUE
# --------
# when we have the detailed DocHeader we can check the rubrique
# (if present, it comes just before the date)
if detailed_text is not None:
header_elts = detailed_text.split(', ')
# check that the first element is not a date or a date fragment
if parse_date(header_elts[0], doc_language) is None:
# most probably news_topic before beginning of date
hyperdata['rubrique'] = header_elts[0]

yield hyperdata
except:
PrintException()
pass
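# --- illustration only (not part of the commit): the detection chain above on one of
# --- the example headers; format_date_fr / format_date_en are defined elsewhere in the
# --- parser, so the regexes below are simplified stand-ins, not the repository's own
import re
import dateparser
demo_date_fr = re.compile(r'\d{1,2}\s+\S+\s+\d{4}', re.UNICODE)   # e.g. "28 janvier 2013"
demo_date_en = re.compile(r'\w+\s+\d{1,2},\s+\d{4}', re.UNICODE)  # e.g. "November 13, 2015"
demo_header = "Seine-Saint-Denis, lundi 28 janvier 2013, p. 93_T_17"
demo_match = demo_date_fr.search(demo_header)
dateparser.parse(demo_match.group(), languages=['fr'], date_formats=['%d %B %Y'])
# -> datetime.datetime(2013, 1, 28, 0, 0)
# the fragment before the date ("Seine-Saint-Denis") is what ends up in hyperdata['rubrique']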
......
# from ..Taggers import NltkTagger
from ..Taggers import TurboTagger
import nltk
from re import sub
"""Base class for all ngrams extractors. """Base class for all ngrams extractors.
...@@ -33,9 +34,21 @@ class NgramsExtractor: ...@@ -33,9 +34,21 @@ class NgramsExtractor:
Returns a list of the ngrams found in the given text. Returns a list of the ngrams found in the given text.
""" """
def extract_ngrams(self, contents): def extract_ngrams(self, contents):
clean_contents = self._prepare_text(contents)
# tagging happens here
tagged_tokens = list(self.tagger.tag_text(clean_contents))
if len(tagged_tokens):
grammar_parsed = self._grammar.parse(tagged_tokens)
for subtree in grammar_parsed.subtrees():
if subtree.label() == self._label:
yield subtree.leaves()
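# --- illustration only (not part of the commit): the chunking step above with a toy
# --- grammar; the real self._grammar and self._label are defined elsewhere in this class
demo_grammar = nltk.RegexpParser('NP: {<JJ>*<NN.*>+}')
demo_tagged = [('white', 'JJ'), ('matter', 'NN'), ('is', 'VBZ'), ('visible', 'JJ')]
[st.leaves() for st in demo_grammar.parse(demo_tagged).subtrees() if st.label() == 'NP']
# -> [[('white', 'JJ'), ('matter', 'NN')]]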
@staticmethod
def _prepare_text(text_contents):
"""
Clean the text for better POS tagging
"""
# strip xml tags
return sub(r"<[^>]{0,45}>","",text_contents)
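# --- illustration only (not part of the commit): quick check of the tag-stripping
# --- regex on a made-up snippet
sub(r"<[^>]{0,45}>", "", "<p>Word <b>embeddings</b> help.</p>")
# -> 'Word embeddings help.'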