Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
2791e98e
Commit
2791e98e
authored
Sep 25, 2017
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'stable' into stable-imt
parents
4e9dc26a
0f3ecfc8
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
118 additions
and
14 deletions
+118
-14
db.py
gargantext/util/db.py
+1
-0
main.py
gargantext/util/toolchain/main.py
+15
-2
gargantext_notebook.py
install/notebook/gargantext_notebook.py
+102
-12
No files found.
gargantext/util/db.py
View file @
2791e98e
...
...
@@ -25,6 +25,7 @@ session = scoped_session(sessionmaker(bind=engine))
########################################################################
from
sqlalchemy.orm
import
aliased
from
sqlalchemy
import
func
,
desc
from
sqlalchemy.sql.expression
import
case
########################################################################
# bulk insertions
...
...
gargantext/util/toolchain/main.py
View file @
2791e98e
...
...
@@ -62,12 +62,12 @@ def parse_extract_indexhyperdata(corpus):
# apply actions
print
(
'CORPUS #
%
d'
%
(
corpus
.
id
))
corpus
.
status
(
'Docs'
,
progress
=
1
)
corpus
.
save_hyperdata
()
session
.
commit
()
parse
(
corpus
)
docs
=
corpus
.
children
(
"DOCUMENT"
)
.
count
()
print
(
'CORPUS #
%
d: parsed
%
d'
%
(
corpus
.
id
,
docs
))
extract_ngrams
(
corpus
)
...
...
@@ -242,6 +242,19 @@ def recount(corpus_id):
corpus
.
save_hyperdata
()
session
.
commit
()
# START OF KLUDGE...
from
gargantext.models
import
NodeNgram
,
DocumentNode
from
.ngrams_addition
import
index_new_ngrams
maplist_id
=
corpus
.
children
(
"MAPLIST"
)
.
first
()
.
id
ngram_ids
=
session
.
query
(
NodeNgram
.
ngram_id
.
distinct
())
indexed_ngrams
=
ngram_ids
.
join
(
DocumentNode
)
.
filter
(
DocumentNode
.
parent_id
==
corpus
.
id
)
not_indexed_ngrams
=
ngram_ids
.
filter
(
NodeNgram
.
node_id
==
maplist_id
,
~
NodeNgram
.
ngram_id
.
in_
(
indexed_ngrams
))
not_indexed_ngrams
=
[
x
[
0
]
for
x
in
not_indexed_ngrams
]
added
=
index_new_ngrams
(
not_indexed_ngrams
,
corpus
)
print
(
'RECOUNT #
%
d: [
%
s] indexed
%
s ngrams'
%
(
corpus
.
id
,
t
(),
added
))
# ...END OF KLUDGE
# -> overwrite occurrences (=> NodeNodeNgram)
occ_id
=
compute_occs
(
corpus
,
groupings_id
=
group_id
,
...
...
install/notebook/gargantext_notebook.py
View file @
2791e98e
...
...
@@ -15,8 +15,9 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django
.
setup
()
from
gargantext.constants
import
QUERY_SIZE_N_MAX
,
get_resource
,
get_resource_by_name
from
gargantext.models
import
Node
,
ProjectNode
,
DocumentNode
from
gargantext.util.db
import
session
,
get_engine
,
func
from
gargantext.models
import
(
Node
,
ProjectNode
,
DocumentNode
,
Ngram
,
NodeNgram
,
NodeNgramNgram
,
NodeNodeNgram
)
from
gargantext.util.db
import
session
,
get_engine
,
func
,
aliased
,
case
from
collections
import
Counter
import
importlib
from
django.http
import
Http404
...
...
@@ -53,20 +54,32 @@ def scan_hal(request):
return
hal
.
scan_results
(
request
)
def _search_docs(corpus_id, request):
    """Return a query over the DOCUMENT nodes of corpus `corpus_id` whose
    title/abstract full-text index matches `request`."""
    docs = session.query(DocumentNode)
    docs = docs.filter_by(parent_id=corpus_id)
    return docs.filter(Node.title_abstract.match(request))
def
_search_docs
(
corpus_id
,
request
,
fast
=
False
):
q
=
session
.
query
(
DocumentNode
)
.
filter_by
(
parent_id
=
corpus_id
)
# Search ngram <request> in hyperdata <field>
H
=
lambda
field
,
request
:
Node
.
hyperdata
[
field
]
.
astext
.
op
(
'~*'
)(
request
)
def scan_gargantext(corpus_id, request):
    """Count the distinct documents of corpus `corpus_id` matching `request`."""
    count_expr = func.count(DocumentNode.id.distinct())
    row = _search_docs(corpus_id, request).with_entities(count_expr).one()
    return row[0]
if
not
fast
:
# Only match <request> starting and ending with word boundary
# Sequence of spaces will match any sequence of spaces
request
=
'
\
s+'
.
join
(
filter
(
None
,
r'\m{}\M'
.
format
(
request
)
.
split
(
' '
)))
return
q
.
filter
(
Node
.
title_abstract
.
match
(
request
))
if
fast
else
\
q
.
filter
(
H
(
'title'
,
request
)
|
H
(
'abstract'
,
request
))
def
scan_gargantext_and_delete
(
corpus_id
,
request
):
r
=
_search_docs
(
corpus_id
,
request
)
.
delete
(
synchronize_session
=
'fetch'
)
def scan_gargantext(corpus_id, request, fast=False, documents=False):
    """Search corpus `corpus_id` for `request`.

    Returns the list of matching documents when `documents` is True,
    otherwise the count of distinct matching documents.
    `fast` is forwarded to `_search_docs` to select the search strategy.
    """
    matching = _search_docs(corpus_id, request, fast)
    if not documents:
        counter = func.count(DocumentNode.id.distinct())
        return matching.with_entities(counter).one()[0]
    return matching.all()
def scan_gargantext_and_delete(corpus_id, request, fast=False):
    """Delete every document of corpus `corpus_id` matching `request`,
    commit the transaction, and return the number of rows deleted."""
    matching = _search_docs(corpus_id, request, fast)
    deleted = matching.delete(synchronize_session='fetch')
    session.commit()
    return deleted
...
...
@@ -191,3 +204,80 @@ def run_moissonneur(moissonneur, project, name, query):
session
.
commit
()
return
corpus
ALL_LIST_TYPES
=
[
'main'
,
'map'
,
'stop'
]
def _ngrams(corpus_id, list_types, entities):
    """Build a query selecting `entities` for the ngrams of corpus
    `corpus_id` that belong to the given list types.

    `list_types` is a string or an iterable of strings among
    ALL_LIST_TYPES ('main', 'map', 'stop'); unknown types are ignored.
    """
    if isinstance(list_types, str):
        list_types = (list_types,)
    typenames = []
    for t in list_types:
        if t in ALL_LIST_TYPES:
            typenames.append('{}LIST'.format(t.upper()))
    # Here `Node` plays the role of the list itself,
    # ie. MAINLIST and/or MAPLIST and/or STOPLIST
    query = session.query(*entities).select_from(Ngram)
    return query.filter(NodeNgram.ngram_id == Ngram.id,
                        NodeNgram.node_id == Node.id,
                        Node.parent_id == corpus_id,
                        Node.typename.in_(typenames))
def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=False,
                with_count=False):
    """Return a query yielding the ngrams of corpus `corpus_id` from the
    requested list types.

    Each row is a (type, ng) tuple -- list type name and ngram terms --
    plus the ngram id and, when `with_count` is True, its OCCURRENCES
    score. When `with_synonyms` is True, grouped synonyms from the
    GROUPLIST are included as well. Rows are sorted by list type, then
    alphabetically (unless `with_count` rewraps the query, see below).
    """
    # Link between a GROUPLIST, a normal form (ngram1), and a synonym (ngram2)
    NNN = NodeNgramNgram
    # Get the list type from the Node type -- as in CSV export
    list_type = (case([(Node.typename == 'MAINLIST', 'main'),
                       (Node.typename == 'MAPLIST', 'map'),
                       (Node.typename == 'STOPLIST', 'stop')])
                 .label('type'))
    # We will retrieve each ngram as the following tuple:
    entities = (list_type, Ngram.terms.label('ng'))
    if with_count:
        # Keep the ngram id so the OCCURRENCES score can be joined below
        entities += (Ngram.id.label('id'),)
    # First, get ngrams from wanted lists
    ngrams = _ngrams(corpus_id, list_types, entities)
    # Secondly, exclude "synonyms" (grouped ngrams that are not normal forms).
    # We have to exclude synonyms first because data is inconsistent and some
    # of them can be both in GROUPLIST and in MAIN/MAP/STOP lists. We want to
    # take synonyms from GROUPLIST only -- see below.
    Groups = aliased(Node, name='groups')
    # Keep only ngrams with no GROUPLIST row naming them as a synonym
    # (NNN.ngram1_id is NULL after the outer joins).
    query = (ngrams.outerjoin(Groups,
                              (Groups.parent_id == corpus_id) &
                              (Groups.typename == 'GROUPLIST'))
                   .outerjoin(NNN,
                              (NNN.node_id == Groups.id) &
                              (NNN.ngram2_id == Ngram.id))
                   .filter(NNN.ngram1_id == None))
    # If `with_synonyms` is True, add them from GROUPLIST: this is the reliable
    # source for them
    if with_synonyms:
        Synonym = aliased(Ngram)
        # NOTE(review): `ent` always carries a third `id` column while
        # `entities` only does when `with_count` is True -- confirm the
        # union below lines up when with_synonyms=True, with_count=False.
        ent = (list_type,
               Synonym.terms.label('ng'),
               Synonym.id.label('id'))
        synonyms = (ngrams.with_entities(*ent)
                          .filter(NNN.ngram1_id == Ngram.id,
                                  NNN.ngram2_id == Synonym.id,
                                  NNN.node_id == Groups.id,
                                  Groups.parent_id == corpus_id,
                                  Groups.typename == 'GROUPLIST'))
        query = query.union(synonyms)
    # Again, data is inconsistent: MAINLIST may intersect with MAPLIST and
    # we don't want that
    if 'main' in list_types and 'map' not in list_types:
        # Exclude MAPLIST ngrams from MAINLIST
        query = query.except_(_ngrams(corpus_id, 'map', entities))
    if with_count:
        # Wrap the ngram selection in a subquery and attach the score of
        # the corpus' OCCURRENCES node via NodeNodeNgram (outer join, so
        # ngrams without a score still appear).
        N = query.subquery()
        return (session.query(N.c.type, N.c.ng, NodeNodeNgram.score)
                       .join(Node,
                             (Node.parent_id == corpus_id) &
                             (Node.typename == 'OCCURRENCES'))
                       .outerjoin(NodeNodeNgram,
                                  (NodeNodeNgram.ngram_id == N.c.id) &
                                  (NodeNodeNgram.node1_id == Node.id) &
                                  (NodeNodeNgram.node2_id == corpus_id)))
    # Return found ngrams sorted by list type, and then alphabetically
    return query.order_by('type', 'ng')
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment