Commit d5a8e664 authored by Romain Loth

[WIP] importExport: import ~ ok; export à finir

parent ee9015d3
...@@ -14,8 +14,15 @@ from ngram.tools import insert_ngrams ...@@ -14,8 +14,15 @@ from ngram.tools import insert_ngrams
from analysis.lists import WeightedList, UnweightedList from analysis.lists import WeightedList, UnweightedList
from collections import defaultdict from collections import defaultdict
from csv import writer, reader, QUOTE_MINIMAL
def exportNgramList(node,filename):
def get_id(ngram_terms):
    """
    Look up the id of the Ngram whose ``terms`` equal ``ngram_terms``.

    Returns whatever ``.first()`` yields for a query restricted to the
    ``Ngram.id`` column: the first matching result, or None when no
    ngram has these terms.

    NOTE(review): with SQLAlchemy this is presumably a one-element row
    ``(id,)`` rather than a bare integer -- confirm against callers.
    """
    matching_ids = session.query(Ngram.id).filter(Ngram.terms == ngram_terms)
    return matching_ids.first()
def exportNgramList(node,filename,delimiter="\t"):
# les nodes couvrant les listes # les nodes couvrant les listes
# ----------------------------- # -----------------------------
...@@ -23,92 +30,205 @@ def exportNgramList(node,filename): ...@@ -23,92 +30,205 @@ def exportNgramList(node,filename):
miam_node = get_or_create_node(nodetype='MiamList', corpus=node) miam_node = get_or_create_node(nodetype='MiamList', corpus=node)
map_node = get_or_create_node(nodetype='MapList', corpus=node) map_node = get_or_create_node(nodetype='MapList', corpus=node)
group_node = get_or_create_node(nodetype='Group', corpus=node) group_node = get_or_create_node(nodetype='Group', corpus=node)
# listes de ngram_ids correspondantes # listes de ngram_ids correspondantes
# ------------------------------------ # ------------------------------------
#~~ contenu: liste des ids [2562,...] #~~ contenu: liste des ids [2562,...]
stop_ngrams_ids = [stop_ngram.ngram_id for stop_ngram in stop_node.node_node_ngram_collection] stop_ngram_ids = [stop_ngram.ngram_id for stop_ngram in stop_node.node_node_ngram_collection]
# idem pour miam et map # idem pour miam et map
miam_ngrams_ids = [miam_ng.ngram_id for miam_ng in miam_node.node_node_ngram_collection] miam_ngram_ids = [miam_ng.ngram_id for miam_ng in miam_node.node_node_ngram_collection]
map_ngrams_ids = [map_ng.ngram_id for map_ng in map_node.node_node_ngram_collection] map_ngram_ids = [map_ng.ngram_id for map_ng in map_node.node_node_ngram_collection]
# union des listes (est-elle nécessaire ?)
all_ngrams = set(
set(stop_ngrams_ids)
| set(map_ngrams_ids)
| set(miam_ngrams_ids)
)
# pour la group_list on a des couples de ngram_ids # pour la group_list on a des couples de ngram_ids
# ------------------- # -------------------
# ex: [(3544, 2353), (2787, 4032), ...] # ex: [(3544, 2353), (2787, 4032), ...]
group_ngrams_id_couples = [(nd_ng_ng.ngramx_id,nd_ng_ng.ngramy_id) for nd_ng_ng in group_node.node_nodengramngram_collection] group_ngram_id_couples = [(nd_ng_ng.ngramx_id,nd_ng_ng.ngramy_id) for nd_ng_ng in group_node.node_nodengramngram_collection]
# k couples comme set # k couples comme set
# -------------------- # --------------------
# [(a => x) (a => y)] => [a => {x,y}] # [(a => x) (a => y)] => [a => {x,y}]
grouped = defaultdict(set) grouped = defaultdict(set)
for ngram in group_ngrams : for ngram in group_ngram_id_couples:
# /!\ just in one direction /!\
# a => {x} but not x => {a}
grouped[ngram[0]].add(ngram[1]) grouped[ngram[0]].add(ngram[1])
all_ngrams.add(ngram[0])
all_ngrams.add(ngram[1])
toList = list()
# pour récupérer les objets Ngram (avec terme) # helper func
def ngrams_to_csv_rows(ngram_ids, id_groupings=None, list_type=7):
    """
    Build a table of basic information, one row per ngram.

    Each row has 5 columns:
        [ngram id, terms, n, list_type, grouped ids]
    The last column holds the ids grouped with this ngram, joined
    with "|" (e.g. "4|42"), or "" when the ngram has no grouping.

    Parameters:
        ngram_ids    -- ids of the Ngram objects to fetch
        id_groupings -- optional mapping {ngram_id: iterable of ids
                        grouped with it}; None means "no groupings"
        list_type    -- value copied unchanged into the 4th column
                        of every row

    Returns csv_rows, a list of lists:
        [
          [row1_colA, row1_colB, ..],
          [row2_colA, row2_colB, ..],
          ..
        ]
    which can then be passed to e.g. csv.writer(...).writerows(csv_rows).
    """
    # None instead of a shared mutable {} default
    if id_groupings is None:
        id_groupings = {}

    # fetch all the Ngram objects (with their terms) in a single query
    ng_objs = session.query(Ngram).filter(Ngram.id.in_(ngram_ids)).all()

    # transcribe them as a table (list of lists)
    csv_rows = []
    for ng_obj in ng_objs:
        ng_id = ng_obj.id

        # explicit membership test: id_groupings may be a defaultdict,
        # and indexing a missing key would insert an empty entry
        if ng_id in id_groupings:
            this_grouped = "|".join(str(gid) for gid in id_groupings[ng_id])
        else:
            this_grouped = ""

        # 5 columns: ID, terms, n, list type, gid|gid|gid
        csv_rows.append(
            [ng_id, ng_obj.terms, ng_obj.n, list_type, this_grouped]
        )

    return csv_rows
# on applique notre fonction ng_to_csv sur chaque liste
# ------------------------------------------------------
stop_csv_rows = ngrams_to_csv_rows(stop_ngram_ids,
id_groupings=grouped,
list_type=0)
# miam contient map donc il y a un préalable ici
miam_without_map = [ng for ng in miam_ngram_ids if ng not in map_ngram_ids]
miam_csv_rows = ngrams_to_csv_rows(miam_without_map,
id_groupings=grouped,
list_type=1)
map_csv_rows = ngrams_to_csv_rows(map_ngram_ids,
id_groupings=grouped,
list_type=2)
# all lists together now
this_corpus_all_rows = stop_csv_rows + miam_csv_rows + map_csv_rows
# output
with open(filename, 'w') as out_file:
# csv.writer()
csv_wr = writer(out_file,
delimiter=delimiter,
quoting=QUOTE_MINIMAL)
# write to outfile
csv_wr.writerows(this_corpus_all_rows)
# in_ => OUTER JOIN préalable ?
#~ def add_ngram(fromList, toList=toList, grouplist=grouped, all_ngrams=all_ngrams, weight=0):
#~ for ngram_id in from_list:
#~ all_ngrams.add(ngram_id)
#~ if ngram_id in grouplist.keys():
#~ ngrams.append((ngram_id, grouped[ngram_id], weight))
#~ else :
#~ ngram.append((ngram_id, "", weight))
#~
#~ add_ngrams(stop_ngrams, weight=0)
#~ add_ngrams(miam_ngrams, weight=1)
#~ add_ngrams(map_ngrams, weight=2)
# to csv
with open(filename, "w") as f:
for ngram in ngrams:
f.write(ngram)
def importNgramList(node,filename): def importNgramList(node,filename,delimiter="\t",modify_lists=[0,1,2]):
''' '''
Suppose Suppose une table CSV avec colonnes comme dans fonction export.
/!\ efface et remplace les listes existantes /!\
/!\ (supprime leur collection de NodeNgrams) /!\
''' '''
list_types_shortcuts = {
0: "StopList",
1: "MiamList",
2: "MapList",
}
# on supprime tous les NodeNgrams des listes à modifier
# ------------------------------------------------------
for list_shortcut in modify_lists:
# find previous listnode id
list_type = list_types_shortcuts[list_shortcut]
list_node = get_or_create_node(nodetype=list_type, corpus=node)
node_id = listnode.id
# delete previous lists
session.query(NodeNgram).filter(NodeNgram.node_id==list_node.id).delete()
session.commit()
# on lit le CSV
# --------------
ngrams_csv_rows = []
with open(filename, "r") as f: with open(filename, "r") as f:
ngrams_list = f.read().splitlines() ngrams_csv_rows = reader(f,
delimiter = delimiter,
quoting = QUOTE_MINIMAL
)
all_read_terms = list()
# for row delete others and
stop_words = set(stop_list)
stop_ids = insert_ngrams([(word, len(word.split(' '))) for word in stop_words])
stop_node = get_or_create_node(nodetype='StopList', corpus=node)
session.add(stop_node) for csv_row in ngrams_csv_rows:
session.commit() this_ng_id = csv_row[0]
this_ng_terms = csv_row[1]
this_ng_nlen = csv_row[2]
this_ng_list_type_id = csv_row[3]
this_ng_grouped_ngs = csv_row[4]
# --- quelle liste cible ?
# par ex: "MiamList"
list_type = type_ids_cache[this_ng_list_type_id]
tgt_list_node = get_or_create_node(nodetype=list_type, corpus=node)
# --- test 1: forme existante dans node_ngram ?
#preexisting = session.query(Ngram).filter(Ngram.terms == this_ng_terms).first()
#if preexisting is None:
# # todo ajouter Ngram dans la table node_ngram
# avec un nouvel ID
# --- test 2: forme déjà dans une liste ?
#if preexisting is not None:
# # premier node de type "liste" mentionnant ce ngram_id
# #
# node_ngram = preexisting.node_node_ngram_collection[0]
# previous_list = node_ngram.node_id
#
# ---------------
data[0] = tgt_list_node.id
data[1] = this_ng_id # on suppose le même ngram_id
data[2] =
size = len(list(stop_words)) size = len(list(stop_words))
data = zip(
[stop_node.id for i in range(0,size)]
, [stop_ids[word] for word in list(stop_words)]
, [-1 for i in range(0,size)]
)
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data]) bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
# bulk_insert(NodeNgramNgram, ['node_id', 'ngramx_id', 'ngramy_id', 'weight'], [d for d in data])
# lecture des ngrams préexistants
# ------------------
# Remarque quand on a un list_node li alors faire:
# li.node_node_ngram_collection
# (donne tous les node_ngram)
# (plus rapide que lancer une nouvelle session.query)
#
# TODO utiliser carrément :
# [w.node_ngram for w in listnode.node_node_ngram_collection]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment