Commit 844de0c2 (humanities/gargantext)
Authored Mar 04, 2016 by delanoe

Merge branch 'refactoring-rom' into refactoring-alex

Parents: 85f8dd96, 32495844

Showing 4 changed files, with 164 additions and 7 deletions:
- gargantext/util/lists.py (+29 -7)
- gargantext/util/toolchain/__init__.py (+5 -0)
- gargantext/util/toolchain/group.py (+122 -0, new file)
- gargantext/util/toolchain/parsing.py (+8 -0)
gargantext/util/lists.py
```diff
@@ -70,8 +70,10 @@ class _BaseClass:

 class Translations(_BaseClass):
-    def __init__(self, source=None):
+    def __init__(self, source=None, just_items=False):
         self.items = defaultdict(int)
         # TODO lazyinit for groups
         #      (not necessary for save)
         self.groups = defaultdict(set)
         if source is None:
             return
```
```diff
@@ -83,15 +85,35 @@ class Translations(_BaseClass):
                 .filter(NodeNgramNgram.node_id == source)
             )
             self.items.update(query)
-            for key, value in self.items.items():
-                self.groups[value].add(key)
+            if not just_items:
+                for key, value in self.items.items():
+                    self.groups[value].add(key)
         elif isinstance(source, Translations):
             self.items.update(source.items)
-            self.groups.update(source.groups)
+            if not just_items:
+                self.groups.update(source.groups)
         elif hasattr(source, '__iter__'):
+            # not very intuitive with update here:
+            # /!\ source must be "reversed" (like self.items)
+            # bad example
+            #   In > couples = [(1, 2), (1, 3)]
+            #   In > tlko = Translations(couples)
+            #   Out> Translations {1: 3}
+            #   In > tlko.save()
+            #   DB-- 3 -> 1
+            # good example
+            #   In > reversed_couples = [(2, 1), (3, 1)]
+            #   In > tlok = Translations(reversed_couples)
+            #   Out> Translations {2: 1, 3: 1}
+            #   In > tlok.save()
+            #   DB-- 1 -> 2
+            #   DB-- 1 -> 3
             self.items.update(source)
-            for key, value in self.items.items():
-                self.groups[value].add(key)
+            if not just_items:
+                for key, value in self.items.items():
+                    self.groups[value].add(key)
         else:
             raise TypeError
```
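The diff's own comments spell out the "reversed couples" contract. A minimal sketch of the constructor semantics with the new `just_items` flag, using plain dicts instead of the real DB-backed class (the `build` helper is hypothetical, for illustration only):

```python
# Sketch of Translations' constructor logic with the new just_items flag;
# plain dicts stand in for the database-backed class.
from collections import defaultdict

def build(source_couples, just_items=False):
    items = defaultdict(int)       # secondary -> primary
    groups = defaultdict(set)      # primary -> {secondaries}
    items.update(source_couples)   # couples must be "reversed", as above
    if not just_items:             # the new flag skips the reverse index
        for key, value in items.items():
            groups[value].add(key)
    return dict(items), dict(groups)

print(build([(2, 1), (3, 1)]))                   # ({2: 1, 3: 1}, {1: {2, 3}})
print(build([(2, 1), (3, 1)], just_items=True))  # ({2: 1, 3: 1}, {})
```

With `just_items=True` the reverse `groups` index is never built; `compute_groups()` below passes the flag because it only needs the forward mapping for `save()`.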
```diff
@@ -138,7 +160,7 @@ class Translations(_BaseClass):
         # insert new data
         bulk_insert(
             NodeNgramNgram,
-            ('node_id', 'ngram2_id', 'ngram1_id', 'score'),
+            ('node_id', 'ngram2_id', 'ngram1_id', 'weight'),
             ((node_id, key, value, 1.0) for key, value in self.items.items())
         )
```
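For reference, a sketch of the row shape the `save()` comprehension now produces under the renamed `weight` column (`bulk_insert` and `NodeNgramNgram` are the project's own helpers; the sample ids are made up):

```python
# Rows generated by the inner comprehension in save(), assuming
# self.items == {2: 1, 3: 1} and node_id == 42 (hypothetical values).
node_id = 42
items = {2: 1, 3: 1}
rows = [(node_id, key, value, 1.0) for key, value in items.items()]
print(rows)   # [(42, 2, 1, 1.0), (42, 3, 1, 1.0)]
# columns: ('node_id', 'ngram2_id', 'ngram1_id', 'weight')
```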
gargantext/util/toolchain/__init__.py
```diff
@@ -5,6 +5,7 @@ from .ngrams_extraction import extract_ngrams
 from gargantext.util.db import session
 from gargantext.models import Node
+from .group import compute_groups


 def parse_extract(corpus):
     # retrieve corpus from database from id
```
```diff
@@ -20,3 +21,7 @@ def parse_extract(corpus):
     print('CORPUS #%d: parsed' % (corpus.id))
     extract_ngrams(corpus)
     print('CORPUS #%d: extracted ngrams' % (corpus.id))
+
+    # temporary ngram lists workflow
+    group_id = compute_groups(corpus)
+    print('CORPUS #%d: new grouplist = #%i' % (corpus.id, group_id))
```
gargantext/util/toolchain/group.py (new file, mode 100644)
```python
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.util.db import session
from gargantext.util.lists import Translations

# to convert fr => french :/
from gargantext.util.languages import languages

from nltk.stem.snowball import SnowballStemmer
from re import split as resplit
from collections import defaultdict, Counter


def prepare_stemmers(corpus):
    """
    Returns *several* stemmers (one for each language in the corpus)
    (as a dict of stemmers with key = language_iso2)
    """
    stemmers_by_lg = {
        # always get a generic stemmer in case language code unknown
        '__unknown__': SnowballStemmer("english")
    }
    for lgiso2 in corpus.hyperdata['languages'].keys():
        lgname = languages[lgiso2].name.lower()
        stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
    return stemmers_by_lg


def compute_groups(corpus, stoplist_id=None):
    """
    1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
    3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)
    """
    stop_ngrams_ids = {}
    # we will need the ngrams of the stoplist to filter
    if stoplist_id is not None:
        for id in session.query(NodeNgram.id).filter(NodeNgram.node_id == stoplist_id).all():
            stop_ngrams_ids[id[0]] = True

    # 1) compute stems/lemmas
    #    and group if same stem/lemma
    stemmers = prepare_stemmers(corpus)

    # todo dict {lg => {ngrams_todo} }
    todo_ngrams_per_lg = defaultdict(set)

    # res dict { commonstem: {ngram_1: freq_1, ngram_2: freq_2, ngram_3: freq_3} }
    my_groups = defaultdict(Counter)

    # preloop per doc to sort ngrams by language
    for doc in corpus.children():
        if 'language_iso2' in doc.hyperdata:
            lgid = doc.hyperdata['language_iso2']
        else:
            lgid = "__unknown__"

        # doc.ngrams is an sql query (ugly but useful intermediate step)
        # FIXME: move the counting and stoplist filtering up here
        for ngram_pack in doc.ngrams.all():
            todo_ngrams_per_lg[lgid].add(ngram_pack)

    # --------------------
    # long loop per ngrams
    for (lgid, todo_ngs) in todo_ngrams_per_lg.items():
        # fun: word::str => stem::str
        stem_it = stemmers[lgid].stem
        for ng in todo_ngs:
            doc_wei = ng[0]
            ngram = ng[1]       # Ngram obj

            # skip if in STOPLIST
            if ngram.id in stop_ngrams_ids:
                continue

            lexforms = [lexunit for lexunit in resplit(r'\W+', ngram.terms)]

            # STEM IT, and this term's stems will become a new grouping key...
            stemseq = " ".join([stem_it(lexfo) for lexfo in lexforms])

            # ex:
            #   groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
            #   groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
            my_groups[stemseq][ngram.id] += doc_wei

    del todo_ngrams_per_lg

    # now serializing all groups to a list of couples
    ng_couples = []
    addcouple = ng_couples.append
    for grped_ngramids in my_groups.values():
        if len(grped_ngramids) > 1:
            # first find most frequent term in the counter
            winner_id = grped_ngramids.most_common(1)[0][0]
            for ngram_id in grped_ngramids:
                if ngram_id != winner_id:
                    addcouple((winner_id, ngram_id))
    del my_groups

    # 2) Create the list node
    the_group = Node()
    the_group.typename = "GROUPLIST"
    the_group.name = "Group (src: %s)" % corpus.name[0:10]
    the_group.parent_id = corpus.id      # could use corpus.parent_id if free list
    the_group.user_id = corpus.user_id
    # and save the node
    session.add(the_group)
    session.commit()
    the_id = the_group.id

    # 3) Save each grouping couple to DB thanks to the Translations.save() method
    ndngng_list = Translations(
        [(sec, prim) for (prim, sec) in ng_couples],
        just_items=True
    )
    # ...referring to the list node we just got
    ndngng_list.save(the_id)
    return the_id
```
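The heart of `compute_groups()` is the stem-and-bucket step. A self-contained sketch of that core, with made-up terms and weights standing in for what the real function reads from `doc.ngrams`:

```python
# Stem every term, bucket ngram ids by stem sequence, then link each
# group's most frequent member ("winner") to the others.
from collections import defaultdict, Counter
from re import split as resplit
from nltk.stem.snowball import SnowballStemmer

stem_it = SnowballStemmer("english").stem

# hypothetical (ngram_id, term, doc_weight) triples; the real code reads
# these from doc.ngrams and picks a stemmer per document language
ngrams = [(1, "copper engraving", 3), (2, "copper engravings", 1),
          (3, "poster", 5), (4, "posters", 2)]

# bucket ngram ids by their stemmed form, summing document weights
my_groups = defaultdict(Counter)
for ngram_id, term, doc_wei in ngrams:
    stemseq = " ".join(stem_it(w) for w in resplit(r'\W+', term))
    my_groups[stemseq][ngram_id] += doc_wei

# link each group's most frequent member to the others
ng_couples = []
for grped_ngramids in my_groups.values():
    if len(grped_ngramids) > 1:
        winner_id = grped_ngramids.most_common(1)[0][0]
        ng_couples += [(winner_id, other)
                       for other in grped_ngramids if other != winner_id]

print(ng_couples)   # [(1, 2), (3, 4)]
```

The resulting `(winner, other)` couples are exactly the `(prim, sec)` shape that `compute_groups()` reverses before handing them to `Translations(..., just_items=True)`.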
gargantext/util/toolchain/parsing.py
```diff
@@ -2,11 +2,16 @@ from gargantext.util.db import *
 from gargantext.models import *
 from gargantext.constants import *
+from collections import defaultdict


 def parse(corpus):
     try:
         documents_count = 0
         corpus.status('parsing', progress=0)
+        # will gather info about languages
+        observed_languages = defaultdict(int)
         # retrieve resource information
         for resource in corpus.resources():
             # information about the resource
```
```diff
@@ -22,6 +27,7 @@ def parse(corpus):
                     hyperdata=hyperdata,
                 )
                 session.add(document)
+                observed_languages[hyperdata["language_iso2"]] += 1
                 if documents_count % BATCH_PARSING_SIZE == 0:
                     corpus.status('parsing', progress=documents_count)
                     corpus.save_hyperdata()
```
```diff
@@ -29,6 +35,8 @@ def parse(corpus):
                 documents_count += 1
             # update info about the resource
             resource['extracted'] = True
+        # add corpus-level info about languages
+        corpus.hyperdata['languages'] = observed_languages
         # commit all changes
         corpus.status('parsing', progress=documents_count, complete=True)
         corpus.save_hyperdata()
```
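This hunk closes the loop with group.py: `parse()` now counts documents per language code and stores the tally in `corpus.hyperdata['languages']`, whose keys `prepare_stemmers()` later iterates. A tiny sketch of that bookkeeping, with a plain dict standing in for the corpus object:

```python
# Per-language document counting as parse() now does it.
from collections import defaultdict

observed_languages = defaultdict(int)
for doc_lang in ["en", "fr", "en"]:          # hypothetical parsed documents
    observed_languages[doc_lang] += 1

# what parse() stores on the corpus; a plain dict stands in for hyperdata
hyperdata = {'languages': dict(observed_languages)}
print(hyperdata['languages'])                # {'en': 2, 'fr': 1}

# prepare_stemmers() in group.py then builds one SnowballStemmer
# per key of hyperdata['languages'], plus an '__unknown__' fallback
```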