Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
1852e88d
Commit
1852e88d
authored
Sep 22, 2017
by
sim
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Notebook: add corpus_list func to retrieve termlists
parent
42602f74
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
69 additions
and
2 deletions
+69
-2
db.py
gargantext/util/db.py
+1
-0
gargantext_notebook.py
install/notebook/gargantext_notebook.py
+68
-2
No files found.
gargantext/util/db.py
View file @
1852e88d
...
...
@@ -25,6 +25,7 @@ session = scoped_session(sessionmaker(bind=engine))
########################################################################
from
sqlalchemy.orm
import
aliased
from
sqlalchemy
import
func
,
desc
from
sqlalchemy.sql.expression
import
case
########################################################################
# bulk insertions
...
...
install/notebook/gargantext_notebook.py
View file @
1852e88d
...
...
@@ -15,8 +15,9 @@ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gargantext.settings')
django
.
setup
()
from
gargantext.constants
import
QUERY_SIZE_N_MAX
,
get_resource
,
get_resource_by_name
from
gargantext.models
import
Node
,
ProjectNode
,
DocumentNode
from
gargantext.util.db
import
session
,
get_engine
,
func
from
gargantext.models
import
(
Node
,
ProjectNode
,
DocumentNode
,
Ngram
,
NodeNgram
,
NodeNgramNgram
)
from
gargantext.util.db
import
session
,
get_engine
,
func
,
aliased
,
case
from
collections
import
Counter
import
importlib
from
django.http
import
Http404
...
...
@@ -200,3 +201,68 @@ def run_moissonneur(moissonneur, project, name, query):
session
.
commit
()
return
corpus
ALL_LIST_TYPES
=
[
'main'
,
'map'
,
'stop'
]
def
_ngrams
(
corpus_id
,
list_types
,
entities
):
list_types
=
(
list_types
,)
if
isinstance
(
list_types
,
str
)
else
list_types
list_typenames
=
[
'{}LIST'
.
format
(
t
.
upper
())
for
t
in
list_types
if
t
in
ALL_LIST_TYPES
]
# `Node` is our list, ie. MAINLIST and/or MAPLIST and/or STOPLIST
return
(
session
.
query
(
*
entities
)
.
select_from
(
Ngram
)
.
filter
(
NodeNgram
.
ngram_id
==
Ngram
.
id
,
NodeNgram
.
node_id
==
Node
.
id
,
Node
.
parent_id
==
corpus_id
,
Node
.
typename
.
in_
(
list_typenames
)))
def
corpus_list
(
corpus_id
,
list_types
=
ALL_LIST_TYPES
,
with_synonyms
=
True
):
# Link between a GROUPLIST, a normal form (ngram1), and a synonym (ngram2)
NNN
=
NodeNgramNgram
# Get the list type from the Node type -- as in CSV export
list_type
=
(
case
([(
Node
.
typename
==
'MAINLIST'
,
'main'
),
(
Node
.
typename
==
'MAPLIST'
,
'map'
),
(
Node
.
typename
==
'STOPLIST'
,
'stop'
)])
.
label
(
'type'
))
# We will retrieve each ngram as the following tuple:
entities
=
(
list_type
,
Ngram
.
terms
.
label
(
'ng'
))
# First, get ngrams from wanted lists
ngrams
=
_ngrams
(
corpus_id
,
list_types
,
entities
)
# Secondly, exclude "synonyms" (grouped ngrams that are not normal forms).
# We have to exclude synonyms first because data is inconsistent and some
# of them can be both in GROUPLIST and in MAIN/MAP/STOP lists. We want to
# take synonyms from GROUPLIST only -- see below.
Groups
=
aliased
(
Node
,
name
=
'groups'
)
query
=
(
ngrams
.
outerjoin
(
Groups
,
(
Groups
.
parent_id
==
corpus_id
)
&
(
Groups
.
typename
==
'GROUPLIST'
))
.
outerjoin
(
NNN
,
(
NNN
.
node_id
==
Groups
.
id
)
&
(
NNN
.
ngram2_id
==
Ngram
.
id
))
.
filter
(
NNN
.
ngram1_id
==
None
))
# If `with_synonyms` is True, add them from GROUPLIST: this is the reliable
# source for them
if
with_synonyms
:
Synonym
=
aliased
(
Ngram
)
synonyms
=
(
ngrams
.
with_entities
(
list_type
,
Synonym
.
terms
.
label
(
'ng'
))
.
filter
(
NNN
.
ngram1_id
==
Ngram
.
id
,
NNN
.
ngram2_id
==
Synonym
.
id
,
NNN
.
node_id
==
Groups
.
id
,
Groups
.
parent_id
==
corpus_id
,
Groups
.
typename
==
'GROUPLIST'
))
query
=
query
.
union
(
synonyms
)
# Again, data is inconsistent: MAINLIST may intersect with MAPLIST and
# we don't wan't that
if
'main'
in
list_types
and
'map'
not
in
list_types
:
# Exclude MAPLIST ngrams from MAINLIST
entities
=
(
"'main'"
,
entities
[
1
])
query
=
query
.
except_
(
_ngrams
(
corpus_id
,
'map'
,
entities
))
# Return found ngrams sorted by list type, and then alphabetically
return
query
.
order_by
(
'type'
,
'ng'
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment