Commit d87916f9 authored by delanoe

Merge branch 'romain' into unstable

parents 485f2b02 d5a8e664
@@ -125,21 +125,14 @@ def project(request, project_id):
    thefile = form.cleaned_data['file']
    resourcetype = cache.ResourceType[form.cleaned_data['type']]
-   # which default language shall be used?
-   if resourcetype.name == "Europress (French)":
-       language_id = cache.Language['fr'].id
-   elif resourcetype.name == "Europress (English)":
-       language_id = cache.Language['en'].id
-   else:
-       language_id = None
    # corpus node instantiation as a Django model
    corpus = Node(
        name = name,
        user_id = request.user.id,
        parent_id = project_id,
        type_id = cache.NodeType['Corpus'].id,
-       language_id = language_id,
+       # no default language at this point
+       language_id = None,
        hyperdata = {'Processing' : "Parsing documents",}
    )
    session.add(corpus)
......
import re
from admin.utils import PrintException
-from gargantext_web.db import Node, Ngram, NodeNgram, NodeNodeNgram
+from gargantext_web.db import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNgramNgram
from gargantext_web.db import cache, session, get_or_create_node, bulk_insert
import sqlalchemy as sa
@@ -13,73 +13,222 @@ from sqlalchemy.orm import aliased
from ngram.tools import insert_ngrams
from analysis.lists import WeightedList, UnweightedList
-def exportNgramList(node,filename):
+from collections import defaultdict
+from csv import writer, reader, QUOTE_MINIMAL
+
+def get_id(ngram_terms):
+    query = session.query(Ngram.id).filter(Ngram.terms==ngram_terms).first()
+    return(query)
+
+def exportNgramList(node, filename, delimiter="\t"):
+    # the nodes holding the lists
+    # ---------------------------
    stop_node  = get_or_create_node(nodetype='StopList', corpus=node)
    miam_node  = get_or_create_node(nodetype='MiamList', corpus=node)
    map_node   = get_or_create_node(nodetype='MapList', corpus=node)
    group_node = get_or_create_node(nodetype='Group', corpus=node)
-   stop_ngrams  = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id==stop_node.id).all()
-   miam_ngrams  = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id==miam_node.id).all()
-   map_ngrams   = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id==map_node.id).all()
-   group_ngrams = (session.query(NodeNgramNgram.ngramx_id, NodeNgramNgram.ngramy_id)
-                   .filter(NodeNgramNgram.node_id==group_node.id)
-                   .all()
-                   )
-   all_ngrams = set()
-   grouped = defaultdict(lambda: defaultdict(set))
-   toList = list()
-   for ngram in group_ngrams :
-       grouped[ngram[0]].add(ngram[1])
-       all_ngrams.add(ngram[0])
-       all_ngrams.add(ngram[1])
-   def add_ngram(fromList, toList=toList, grouplist=grouped, all_ngrams=all_ngrams, weight=0):
-       for ngram_id in from_list:
-           all_ngrams.add(ngram_id)
-           if ngram_id in grouplist.keys():
-               ngrams.append((ngram_id, grouped[ngram_id], weight))
-           else :
-               ngram.append((ngram_id, "", weight))
-   add_ngrams(stop_ngrams, weight=0)
-   add_ngrams(miam_ngrams, weight=1)
-   add_ngrams(map_ngrams, weight=2)
-   # to csv
-   with open(filename, "w") as f:
-       f.write(ngram) for ngram in ngrams
+   # matching lists of ngram_ids
+   # ---------------------------
+   #~~ content: list of ids [2562,...]
+   stop_ngram_ids = [stop_ngram.ngram_id for stop_ngram in stop_node.node_node_ngram_collection]
+   # same for miam and map
+   miam_ngram_ids = [miam_ng.ngram_id for miam_ng in miam_node.node_node_ngram_collection]
+   map_ngram_ids  = [map_ng.ngram_id for map_ng in map_node.node_node_ngram_collection]
+
+   # for the group_list we have couples of ngram_ids
+   # ------------------------------------------------
+   # ex: [(3544, 2353), (2787, 4032), ...]
+   group_ngram_id_couples = [(nd_ng_ng.ngramx_id, nd_ng_ng.ngramy_id) for nd_ng_ng in group_node.node_nodengramngram_collection]
+
+   # the couples as sets
+   # -------------------
+   # [(a => x) (a => y)] => [a => {x,y}]
+   grouped = defaultdict(set)
+   for ngram in group_ngram_id_couples:
+       # /!\ just in one direction /!\
+       # a => {x} but not x => {a}
+       grouped[ngram[0]].add(ngram[1])
+   # helper func
+   def ngrams_to_csv_rows(ngram_ids, id_groupings={}, list_type=7):
+       """
+       Table of basic infos per ngram:
+         (ng_id, term form, weight, list_type)
+       with an optional extra column:
+         ngrams grouped with this id, ex: "4|42"
+
+       Returns a csv_rows matrix as a list of lists
+          [
+           [row1_colA, row1_colB..],
+           [row2_colA, row2_colB..],
+           ..
+          ]
+       (then for instance csv.writer.writerows(csv_rows))
+       """
+       # fetch the Ngram objects (with their terms) in one go
+       ng_objs = session.query(Ngram).filter(Ngram.id.in_(ngram_ids)).all()
+
+       # transcribe them into a table (list of lists)
+       csv_rows = list()
+       for ng_obj in ng_objs:
+           ng_id = ng_obj.id
+           if ng_id in id_groupings.keys():
+               this_grouped = "|".join(str(gid) for gid in id_groupings[ng_id])
+           else:
+               this_grouped = ""
+           # transcription: 5 columns
+           # ID , terms , n , list_type , gid|gid|gid
+           csv_rows.append(
+               [ng_id, ng_obj.terms, ng_obj.n, list_type, this_grouped]
+           )
+       # csv_rows = [[row1_a, row1_b..],[row2_a, row2_b..],..]
+       return csv_rows
+
+   # apply our ngrams_to_csv_rows function to each list
+   # ---------------------------------------------------
+   stop_csv_rows = ngrams_to_csv_rows(stop_ngram_ids,
+                                      id_groupings=grouped,
+                                      list_type=0)
+   # miam contains map, so there is a preliminary step here
+   miam_without_map = [ng for ng in miam_ngram_ids if ng not in map_ngram_ids]
+   miam_csv_rows = ngrams_to_csv_rows(miam_without_map,
+                                      id_groupings=grouped,
+                                      list_type=1)
+   map_csv_rows = ngrams_to_csv_rows(map_ngram_ids,
+                                     id_groupings=grouped,
+                                     list_type=2)
+
+   # all lists together now
+   this_corpus_all_rows = stop_csv_rows + miam_csv_rows + map_csv_rows
+
+   # output
+   with open(filename, 'w') as out_file:
+       # csv.writer()
+       csv_wr = writer(out_file,
+                       delimiter=delimiter,
+                       quoting=QUOTE_MINIMAL)
+       # write to outfile
+       csv_wr.writerows(this_corpus_all_rows)
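
For reference, a minimal sketch (not part of the commit) of how a file written by exportNgramList can be read back: each row carries the five columns built in ngrams_to_csv_rows (id, terms, n, list_type with 0=stop / 1=miam / 2=map, and grouped ids joined by "|"). The helper name below is hypothetical.

from csv import reader

def preview_exported_list(filename, delimiter="\t", max_rows=5):
    # hypothetical helper, illustration only
    with open(filename) as f:
        for i, row in enumerate(reader(f, delimiter=delimiter)):
            if i >= max_rows:
                break
            ngram_id, terms, n, list_type, grouped = row
            grouped_ids = grouped.split("|") if grouped else []
            print(ngram_id, terms, n, list_type, grouped_ids)
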
-def importNgramList(node,filename):
+def importNgramList(node, filename, delimiter="\t", modify_lists=[0,1,2]):
    '''
-   Suppose
+   Assumes a CSV table with columns as in the export function.
+   /!\ erases and replaces the existing lists /!\
+   /!\ (deletes their collection of NodeNgrams) /!\
    '''
+   list_types_shortcuts = {
+       0: "StopList",
+       1: "MiamList",
+       2: "MapList",
+   }
+
+   # delete all the NodeNgrams of the lists to modify
+   # -------------------------------------------------
+   for list_shortcut in modify_lists:
+       # find previous listnode id
+       list_type = list_types_shortcuts[list_shortcut]
+       list_node = get_or_create_node(nodetype=list_type, corpus=node)
+       node_id = list_node.id
+       # delete previous lists
+       session.query(NodeNgram).filter(NodeNgram.node_id==list_node.id).delete()
+       session.commit()
+
+   # read the CSV
+   # -------------
+   ngrams_csv_rows = []
    with open(filename, "r") as f:
-       ngrams_list = f.read().splitlines()
+       ngrams_csv_rows = reader(f,
+                                delimiter = delimiter,
+                                quoting = QUOTE_MINIMAL
+                                )
+       all_read_terms = list()
+       # for row delete others and
-       stop_words = set(stop_list)
-       stop_ids = insert_ngrams([(word, len(word.split(' '))) for word in stop_words])
-       stop_node = get_or_create_node(nodetype='StopList', corpus=node)
-       session.add(stop_node)
-       session.commit()
+       for csv_row in ngrams_csv_rows:
+           this_ng_id           = csv_row[0]
+           this_ng_terms        = csv_row[1]
+           this_ng_nlen         = csv_row[2]
+           this_ng_list_type_id = csv_row[3]
+           this_ng_grouped_ngs  = csv_row[4]
+
+           # --- which target list?
+           #     e.g. "MiamList"
+           list_type = type_ids_cache[this_ng_list_type_id]
+           tgt_list_node = get_or_create_node(nodetype=list_type, corpus=node)
+
+           # --- test 1: does this form already exist in node_ngram?
+           #preexisting = session.query(Ngram).filter(Ngram.terms == this_ng_terms).first()
+           #if preexisting is None:
+           #    # todo: add the Ngram to the node_ngram table
+           #    #       with a new ID
+
+           # --- test 2: is this form already in a list?
+           #if preexisting is not None:
+           #    # first node of type "list" mentioning this ngram_id
+           #    node_ngram = preexisting.node_node_ngram_collection[0]
+           #    previous_list = node_ngram.node_id
+
+           # ---------------
+           data[0] = tgt_list_node.id
+           data[1] = this_ng_id   # assuming the same ngram_id
+           data[2] =
        size = len(list(stop_words))
        data = zip(
            [stop_node.id for i in range(0,size)]
            , [stop_ids[word] for word in list(stop_words)]
            , [-1 for i in range(0,size)]
        )
        bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
+       # bulk_insert(NodeNgramNgram, ['node_id', 'ngramx_id', 'ngramy_id', 'weight'], [d for d in data])
+
+   # reading the pre-existing ngrams
+   # -------------------------------
+   # Note: when we have a list_node li, then do:
+   #       li.node_node_ngram_collection
+   #       (gives all the node_ngrams)
+   #       (faster than launching a new session.query)
+   #
+   # TODO simply use:
+   #      [w.node_ngram for w in listnode.node_node_ngram_collection]
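
The import loop above is left unfinished in this commit (the data[...] assignments stop short and the pre-existence tests are still commented out). As a rough, hedged sketch of where it seems to be heading, the rows read from the CSV could be grouped by target list and pushed through the same bulk_insert(NodeNgram, ...) path used elsewhere in this file; the function name and the weight value below are assumptions, not repository code.

def import_rows_sketch(node, csv_rows):
    # assumption: csv_rows are (ngram_id, terms, n, list_type_id, grouped) tuples
    list_types_shortcuts = {0: "StopList", 1: "MiamList", 2: "MapList"}
    rows_by_list = defaultdict(list)
    for ngram_id, terms, n, list_type_id, grouped in csv_rows:
        list_type = list_types_shortcuts[int(list_type_id)]
        tgt_list_node = get_or_create_node(nodetype=list_type, corpus=node)
        # weight=1 is a placeholder; the commit does not settle on a value
        rows_by_list[tgt_list_node.id].append((tgt_list_node.id, int(ngram_id), 1))
    for _node_id, data in rows_by_list.items():
        bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], data)
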
@@ -56,6 +56,7 @@ def isStopWord(ngram, stop_words=None):
        , "(.*)\d(.*)"
        , "(.*)(\.)(.*)"
        , "(.*)(\,)(.*)"
+       , "(.*)(< ?/?p ?>)(.*)"     # paragraph markers
        , "(.*)(study)(.*)"
        , "(.*)(xx|xi|xv)(.*)"
        , "(.*)(result)(.*)"
......
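
To illustrate what the pattern added above does (the real isStopWord() lives in this file and may apply its patterns differently), a self-contained sketch:

import re

patterns = [
    r"(.*)\d(.*)",
    r"(.*)(\.)(.*)",
    r"(.*)(\,)(.*)",
    r"(.*)(< ?/?p ?>)(.*)",   # paragraph markers, as added by this commit
]
compiled = [re.compile(p) for p in patterns]

def looks_like_stopword(term):
    # hypothetical helper: flag a term if any pattern matches
    return any(p.match(term) for p in compiled)

assert looks_like_stopword("result </p> of 2015")
assert not looks_like_stopword("climate change")
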
import re
import locale
from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser
import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser_en(FileParser):
def _parse(self, file):
localeEncoding = "fr_FR"
codif = "UTF-8"
format_page = re.compile('p\. .*', re.UNICODE)
def parse_date(date, lang):
d = dateparser.parse(date.strip(), languages=[lang])
return d
if isinstance(file, str):
file = open(file, 'rb')
contents = file.read()
encoding = self.detect_encoding(contents)
if encoding != "utf-8":
try:
contents = contents.decode("latin1", errors='replace').encode(codif)
except:
PrintException()
html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
html_parser = html5parser.etree.HTMLParser(encoding=codif)
html = html5parser.etree.fromstring(contents, html_parser)
html_articles = html.xpath('//article')
name_xpath = "./header/div/span[@class = 'DocPublicationName']"
header_xpath = "./header/div/span[@class = 'DocHeader']"
title_xpath = "./header/div[@class='titreArticle']"
text_xpath = "./section/div[@class='DocText']//p"
def scrap_text(data_xpath):
"""
Retrieves the text of the whole subtree
under a list of nodes (e.g. a list of <p>)
and returns a list of strings
"""
result = list()
# a priori a single title, or several <p>, in data_xpath
for elem in data_xpath:
all_text = list()
# we use itertext to get
# every sub-element exactly once
# whatever the depth
for sub_txt in elem.itertext(with_tail=True):
sub_txt_clean = sub_txt.strip()
if sub_txt_clean != '':
all_text.append(sub_txt_clean)
result.append(" ".join(all_text))
return result
# parse all the articles, one by one
try:
for html_article in html_articles:
hyperdata = {}
try:
pub_name = html_article.xpath(name_xpath)[0].text
name = pub_name.split(', ')
hyperdata['journal'] = name[0]
hyperdata['number'] = name[1]
except:
try:
hyperdata['journal'] = pub_name.strip()
except:
pass
header = html_article.xpath(header_xpath)[0].text
if header is not None:
# careful: in English the date contains 1 or 2 commas
# ex: "Tuesday, November 7, 2012"
# ==> in all these cases the 'en' dateparser.parse
#     will be run on header[i:] and not header[i]
header = header.split(', ')
header = list(filter(lambda x: format_page.match(x) is None, header))
if parse_date(header[0], 'en') is not None:
date = ' '.join(header[0:])
elif parse_date(header[1], 'en') is not None:
hyperdata['rubrique'] = header[0]
date = ' '.join(header[1:])
elif parse_date(header[2], 'en') is not None:
hyperdata['rubrique'] = header[0]
date = ' '.join(header[2:])
elif parse_date(header[3], 'en') is not None:
hyperdata['rubrique'] = header[0]
date = ' '.join(header[3:])
else:
date = '2016'
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['en'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
try:
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
except:
print(hyperdata['title'])
print(date)
try:
title = scrap_text(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
except:
pass
try:
text = scrap_text(html_article.xpath(text_xpath))
hyperdata['abstract'] = '\n'.join([ '<p>\n'+p_text+'</p>\n' for p_text in title[1:] + text])
except:
pass
yield hyperdata
except :
PrintException()
pass
if __name__ == "__main__":
e = EuropressFileParser_en()
hyperdata = e.parse(str(sys.argv[1]))
for h in hyperdata:
try:
print(h['journal'], ":", h['publication_date'])
except:
pass
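
The date handling in _parse() above boils down to: split the DocHeader on ', ', drop page markers, and try dateparser on successively shorter suffixes until one parses (English dates themselves contain commas). A standalone sketch of that idea, with a hypothetical helper name:

import dateparser

def split_header_sketch(header, lang='en'):
    parts = header.split(', ')
    for i in range(len(parts)):
        candidate = ' '.join(parts[i:])
        if dateparser.parse(candidate, languages=[lang]) is not None:
            rubrique = parts[0] if i > 0 else None
            return rubrique, candidate
    return None, None

# split_header_sketch("World news, Tuesday, November 7, 2012")
# -> ("World news", "Tuesday November 7 2012")
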
import re
import locale
from lxml import etree
from lxml.etree import tostring
from lxml.html import html5parser
from itertools import chain
from datetime import datetime, date
from django.utils import timezone
import dateutil.parser
import dateparser
import sys
#sys.path.append('/srv/gargantext')
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from .FileParser import FileParser
#from parsing.NgramsExtractors import *
from ..NgramsExtractors import *
from admin.utils import PrintException
class EuropressFileParser_fr(FileParser):
def _parse(self, file):
localeEncoding = "fr_FR"
codif = "UTF-8"
format_date = re.compile('.*\d{4}.*', re.UNICODE)
def parse_date(date, lang):
d = dateparser.parse(date.strip(), languages=[lang])
return d
if isinstance(file, str):
file = open(file, 'rb')
contents = file.read()
encoding = self.detect_encoding(contents)
if encoding != "utf-8":
try:
contents = contents.decode("latin1", errors='replace').encode(codif)
except:
PrintException()
html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser)
html_parser = html5parser.etree.HTMLParser(encoding=codif)
html = html5parser.etree.fromstring(contents, html_parser)
html_articles = html.xpath('//article')
name_xpath = "./header/div/span[@class = 'DocPublicationName']"
header_xpath = "./header/div/span[@class = 'DocHeader']"
title_xpath = "./header/div[@class='titreArticle']"
text_xpath = "./section/div[@class='DocText']/div[@class='docOcurrContainer']/p"
def scrap_text(data_xpath):
"""
Retrieves the text of the whole subtree
under a list of nodes (e.g. a list of <p>)
and returns a list of strings
"""
result = list()
# a priori a single title, or several <p>, in data_xpath
for elem in data_xpath:
all_text = list()
# we use itertext to get
# every sub-element exactly once
# whatever the depth
for sub_txt in elem.itertext(with_tail=True):
sub_txt_clean = sub_txt.strip()
if sub_txt_clean != '':
all_text.append(sub_txt_clean)
result.append(" ".join(all_text))
return result
# parse all the articles, one by one
try:
for html_article in html_articles:
hyperdata = {}
try:
pub_name = html_article.xpath(name_xpath)[0].text
name = pub_name.split(', ')
hyperdata['journal'] = name[0]
hyperdata['number'] = name[1]
except:
try:
hyperdata['journal'] = pub_name.strip()
except:
pass
header = html_article.xpath(header_xpath)[0].text
if header is not None:
header = header.split(', ')
if parse_date(header[0], 'fr') is not None:
date = header[0]
elif parse_date(header[1], 'fr') is not None:
hyperdata['rubrique'] = header[0]
date = header[1]
try:
hyperdata['page'] = header[2].split(' ')[1]
except:
pass
elif parse_date(header[2], 'fr') is not None:
date = header[2]
elif parse_date(header[0], 'en') is not None:
date = ' '.join(header[0:])
elif parse_date(header[1], 'en') is not None:
date = ' '.join(header[1:])
elif parse_date(header[2], 'en') is not None:
date = ' '.join(header[2:])
try:
hyperdata['publication_date'] = dateparser.parse(date.strip(), languages=['fr', 'en'])
except:
hyperdata['publication_date'] = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
hyperdata['publication_year'] = hyperdata['publication_date'].strftime('%Y')
hyperdata['publication_month'] = hyperdata['publication_date'].strftime('%m')
hyperdata['publication_day'] = hyperdata['publication_date'].strftime('%d')
#print(hyperdata['publication_date'])
try:
title = scrap_text(html_article.xpath(title_xpath))
hyperdata['title'] = title[0]
except:
pass
try:
text = scrap_text(html_article.xpath(text_xpath))
hyperdata['abstract'] = '\n'.join([ '<p>\n'+p_text+'</p>\n' for p_text in title[1:] + text])
# join([ ' <p> ' + p + ' </p> ' for p in title[1:] + text])
except:
pass
yield hyperdata
except :
PrintException()
pass
if __name__ == "__main__":
e = EuropressFileParser_fr()
hyperdata = e.parse(str(sys.argv[1]))
for h in hyperdata:
try:
print(h['journal'], ":", h['publication_date'])
except:
pass
@@ -82,10 +82,20 @@ class ISTex(FileParser):
        if len(hyperdata["genre"])==0:
            hyperdata.pop("genre")
        if "language_iso3" in hyperdata:
-           if len(hyperdata["language_iso3"])>0:
+           # retrieve lang if lang != [] and lang != ["unknown"]
+           # ---------------------------------------------------
+           if len(hyperdata["language_iso3"])>0 and hyperdata["language_iso3"][0] != "unknown" :
                hyperdata["language_iso3"] = hyperdata["language_iso3"][0]
+           # default value = eng
+           # possible even better: langid.classify(abstract)
            else:
+               # NB: 97% of istex docs are eng, hence eng by default
+               # ----------------------------------------------------
                hyperdata["language_iso3"] = "eng"
+               # (cf. api.istex.fr/document/?q=*&facet=language
+               #  and langid tests on the language=["unknown"] docs)
        if "publication_date" in hyperdata:
            RealDate = hyperdata["publication_date"]
......
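
The comment above suggests langid.classify(abstract) as a better fallback than always assuming "eng". A hedged sketch of that idea (langid returns 2-letter ISO 639-1 codes, so the mapping to the iso3 values used here is an assumption and only partial):

import langid

ISO1_TO_ISO3 = {"en": "eng", "fr": "fre"}   # partial mapping, assumption

def guess_language_iso3(abstract, default="eng"):
    if not abstract:
        return default
    iso1, _score = langid.classify(abstract)
    return ISO1_TO_ISO3.get(iso1, default)
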
@@ -3,7 +3,7 @@ from .IsiFileParser import IsiFileParser
from .JstorFileParser import JstorFileParser
from .ZoteroFileParser import ZoteroFileParser
from .PubmedFileParser import PubmedFileParser
-from .EuropressFileParser import EuropressFileParser
+from .EuropressFileParser_en import EuropressFileParser_en   # 2015-12-08: 2-in-1 parser split into _en / _fr
+from .EuropressFileParser_fr import EuropressFileParser_fr
from .ISTex import ISTex
from .CSVParser import CSVParser

+# import * via __init__.py
from .FileParsers import *

parsers = {
@@ -6,9 +7,16 @@ parsers = {
    'Scopus (RIS format)'  : RisFileParser,
    'Zotero (RIS format)'  : ZoteroFileParser,
    'Jstor (RIS format)'   : JstorFileParser,
-   'Europress (French)'   : EuropressFileParser,
-   'Europress (English)'  : EuropressFileParser,
+   # A single entry could replace the French/English variants,
+   # but (TODO) consistency would just have to be checked:
+   #   - with the DB: node_resourcetype
+   #   - with admin/update_corpus.py
    #'Europress'            : EuropressFileParser,
+   'Europress (French)'   : EuropressFileParser_fr,
+   'Europress (English)'  : EuropressFileParser_en,
    'CSVParser'            : CSVParser,
    'ISTex'                : ISTex,
}
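
For context, a minimal sketch of how this mapping is typically consumed (the exact call site is not part of this diff; the helper and its arguments are placeholders). The parse() interface is the one used by the __main__ blocks of the Europress parsers above:

def preview_corpus_file(resourcetype_name, path):
    # hypothetical helper, not in the repository
    parser_class = parsers[resourcetype_name]        # e.g. 'Europress (French)'
    for hyperdata in parser_class().parse(path):
        print(hyperdata.get('title'), hyperdata.get('publication_date'))

# preview_corpus_file('Europress (French)', '/tmp/europresse_export.html')
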