Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
36cd53ef
Commit
36cd53ef
authored
Jun 13, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
import terms table and retrieve ngram ids
parent
3dc26117
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
173 additions
and
3 deletions
+173
-3
ngramlists_tools.py
gargantext/util/ngramlists_tools.py
+173
-3
No files found.
gargantext/util/ngramlists_tools.py
View file @
36cd53ef
...
...
@@ -2,12 +2,16 @@
Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
- query_list(list_id) to retrieve ngrams
-
im
port_ngramlists(corpus_id)
-
ex
port_ngramlists(corpus_id)
"""
from
gargantext.util.group_tools
import
query_groups
,
group_union
from
gargantext.util.db
import
session
,
bulk_insert
,
desc
,
func
from
gargantext.models
import
Ngram
,
NodeNgram
,
NodeNodeNgram
,
NodeNgramNgram
from
gargantext.util.db
import
session
,
desc
,
func
,
\
bulk_insert_ifnotexists
from
gargantext.models
import
Ngram
,
NodeNgram
,
NodeNodeNgram
,
\
NodeNgramNgram
from
gargantext.util.lists
import
UnweightedList
,
Translations
from
sqlalchemy.sql
import
exists
from
os
import
path
...
...
@@ -282,3 +286,169 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
out_file
.
close
()
print
(
"EXPORT: wrote
%
i ngrams to CSV file '
%
s'"
%
(
len
(
this_corpus_all_rows
),
path
.
abspath
(
fname
)))
def
import_ngramlists
(
fname
,
delimiter
=
'
\t
'
,
group_delimiter
=
'|'
):
'''
This function reads a CSV of an ngrams table for a Corpus,
then it converts old ngram_ids to those of the current DB
then recreates an equivalent set of MAINLIST, MAPLIST, STOPLIST + GROUPS
Input example:
oldid | term |nwords| ltype |group_oldids
-------+---------------+------+--------+---------------
3842 water table 2 map 3724
3724 water tables 2 map
4277 water supply 2 map 190362|13415
13415 water supplies 2 map
190362 water-supply 1 map
20489 wastewater 1 map
Output: 3 x UnweightedList + 1 x Translations
@param fname a filename
@param delimiter a character used as separator in the CSV
@param group_delimiter a character used as grouped subforms separator
(in the last column)
The conversion of old_id to ngram_id works in 2 steps:
=> look up each term str in the DB with bulk_insert_ifnotexists
(creates absent ngrams if necessary)
=> use the new ids to map the relations involving the old ones
NB: To merge the imported lists into a corpus node's lists,
chain this function with merge_ngramlists()
'''
# --------------
# READ CSV
# --------------
# main storage for the ngrams by list
import_nodes_ngrams
=
{
'stop'
:[],
'main'
:[],
'map'
:[]}
# separate storage for the term's couples [(term str, nwords int),...]
imported_ngrams_dbdata
=
[]
# and all the old ids, by term (for id lookup after dbdata bulk_insert)
imported_ngrams_oldids
=
{}
# and for the imported_grouping list of couples [(x1,y1),(x1,y2),(x2,y3),..]
imported_groupings
=
[]
# /!\ imported_grouping contains only external ids (aka oldids)
# (ie imported ids.. that will have to be translated
# to target db ids)
fh
=
open
(
fname
,
"r"
)
ngrams_csv_rows
=
reader
(
fh
,
delimiter
=
delimiter
,
quoting
=
QUOTE_MINIMAL
)
n_read_lines
=
0
# load CSV + initial checks
for
i
,
csv_row
in
enumerate
(
ngrams_csv_rows
):
print
(
"---------------READ LINE
%
i"
%
i
)
try
:
this_ng_oldid
=
str
(
csv_row
[
0
])
this_ng_term
=
str
(
csv_row
[
1
])
this_ng_nwords
=
int
(
csv_row
[
2
])
this_list_type
=
str
(
csv_row
[
3
])
this_ng_group
=
str
(
csv_row
[
4
])
except
:
if
i
==
0
:
print
(
"WARN: (skip line) probable header line at CSV
%
s:l.0"
%
fname
)
continue
# --- term checking
if
not
len
(
this_ng_term
)
>
0
:
print
(
"WARN: (skip line) empty term at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
continue
# --- check before any old ID retrieve
if
not
match
(
"
\
d+$"
,
this_ng_oldid
):
print
(
"WARN: (skip line) bad ID at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
continue
else
:
this_ng_oldid
=
int
(
this_ng_oldid
)
# --- check correct list type
if
not
this_list_type
in
[
'stop'
,
'main'
,
'map'
]:
print
(
"WARN: (skip line) wrong list type at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
continue
# ================= Store the data ====================
# the ngram data
imported_ngrams_dbdata
.
append
([
this_ng_term
,
this_ng_nwords
])
imported_ngrams_oldids
[
this_ng_term
]
=
this_ng_oldid
# and the "list to ngram" relation
import_nodes_ngrams
[
this_list_type
]
.
append
(
this_ng_oldid
)
# ====== Store synonyms from the import (if any) ======
if
len
(
this_ng_group
)
!=
0
:
group_as_external_ids
=
this_ng_group
.
split
(
'|'
)
for
external_subform_id
in
group_as_external_ids
:
external_subform_id
=
int
(
external_subform_id
)
imported_groupings
.
append
(
(
this_ng_oldid
,
external_subform_id
)
)
# fyi
n_read_lines
+=
1
# end of CSV read
fh
.
close
()
# ======== ngram save + id lookup =========
# returns a dict {term => id}
new_ngrams_ids
=
bulk_insert_ifnotexists
(
model
=
Ngram
,
uniquekey
=
'terms'
,
fields
=
(
'terms'
,
'n'
),
data
=
imported_ngrams_dbdata
)
del
imported_ngrams_dbdata
# loop on old ngrams and create direct mapping old_id => new_id
old_to_new_id_map
=
{}
for
term
,
oldid
in
imported_ngrams_oldids
.
items
():
old_to_new_id_map
[
oldid
]
=
new_ngrams_ids
[
term
]
del
new_ngrams_ids
del
imported_ngrams_oldids
print
(
old_to_new_id_map
)
print
(
import_nodes_ngrams
)
# ======== Import into lists =========
# 3 x abstract lists
new_lists
=
{
'map'
:
UnweightedList
(),
'main'
:
UnweightedList
(),
'stop'
:
UnweightedList
()
}
for
list_type
in
import_nodes_ngrams
:
for
old_id
in
import_nodes_ngrams
[
list_type
]:
new_id
=
old_to_new_id_map
[
old_id
]
# add to the abstract list
new_lists
[
list_type
]
.
items
.
add
(
new_id
)
# ======== Synonyms =========
new_groups
=
Translations
()
for
(
x
,
y
)
in
imported_groupings
:
new_mainform_id
=
old_to_new_id_map
[
x
]
new_subform_id
=
old_to_new_id_map
[
y
]
# /!\ Translations use (subform => mainform) order
new_groups
.
items
[
new_subform_id
]
=
new_mainform_id
# ------------------------------------------------------------------
print
(
"IMPORT: read
%
i lines from the CSV"
%
n_read_lines
)
return
(
new_lists
,
new_groups
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment