Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
08fc1367
Commit
08fc1367
authored
Mar 09, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'refactoring-alex' into refactoring-rom
parents
3b2c4c53
119705e5
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
458 additions
and
0 deletions
+458
-0
list_map.py
gargantext/util/toolchain/list_map.py
+117
-0
list_stop.py
gargantext/util/toolchain/list_stop.py
+127
-0
ngrams_tools.py
gargantext/util/toolchain/ngrams_tools.py
+68
-0
score_occurrences.py
gargantext/util/toolchain/score_occurrences.py
+59
-0
score_specificity.py
gargantext/util/toolchain/score_specificity.py
+87
-0
No files found.
gargantext/util/toolchain/list_map.py
0 → 100644
View file @
08fc1367
from
gargantext.util.db
import
*
from
gargantext.util.db_cache
import
*
from
gargantext.constants
import
*
from
gargantext.models.ngrams
import
Ngram
,
NodeNgram
,
\
NodeNodeNgram
,
NodeNgramNgram
from
sqlalchemy.sql
import
func
from
sqlalchemy
import
desc
,
asc
,
or_
,
and_
,
Date
,
cast
,
select
from
sqlalchemy
import
literal_column
from
sqlalchemy.orm
import
aliased
from
gargantext.util.toolchain.ngram_tools
import
insert_ngrams
import
csv
def compute_mapList(corpus_id, limit=500, n=1, session=None):
    '''
    Build (or rebuild) the MAPLIST of a corpus: the best `limit` ngrams of
    the MAINLIST ranked by their SPECIFICITY score, minus those present in
    the STOPLIST. Monograms are capped to a small fraction of the list;
    the remaining slots go to multigrams.

    corpus_id -- id of the corpus Node
    limit     -- total number of ngrams to keep in the map list
    n         -- unused here (kept for call-site compatibility)
    session   -- SQLAlchemy session (required)
    '''
    monograms_part = 0.005
    monograms_limit = round(limit * monograms_part)
    multigrams_limit = limit - monograms_limit
    #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

    # ids of the lists/scores attached to this corpus
    # NOTE(review): .first() returns a 1-element row, not a scalar id;
    # SQLAlchemy coerces it in the comparisons below — confirm intended
    list_main_id = session.query(Node.id).filter(
        Node.typename == "MAINLIST",
        Node.parent_id == corpus_id).first()

    list_stop_id = session.query(Node.id).filter(
        Node.typename == "STOPLIST",
        Node.parent_id == corpus_id).first()

    list_group_id = session.query(Node.id).filter(
        Node.typename == "GROUPLIST",
        Node.parent_id == corpus_id).first()

    score_spec_id = session.query(Node.id).filter(
        Node.typename == "SPECIFICITY",
        Node.parent_id == corpus_id).first()

    ListMain  = aliased(NodeNgram)
    ListStop  = aliased(NodeNgram)
    ListGroup = aliased(NodeNgramNgram)
    ScoreSpec = aliased(NodeNodeNgram)

    # FIXME outerjoin does not work with current SqlAlchemy
    # lines below the query do the job but it can be improved
    query = (session.query(ScoreSpec.ngram_id, ScoreSpec.score)
             .join(ListMain, ScoreSpec.ngram_id == ListMain.ngram_id)
             .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
             #.outerjoin(ListGroup, Group.ngramy_id == ScoreSpec.ngram_id)
             #.outerjoin(ListStop, Stop.ngram_id == ScoreSpec.ngram_id)
             .filter(ListMain.node_id == list_main_id)
             #.filter(ListGroup.node_id == list_group_id)
             #.filter(ListStop.node_id == list_stop_id)
             .filter(ScoreSpec.nodex_id == score_spec_id)
             )

    top_monograms = (query
                     .filter(Ngram.n == 1)
                     .order_by(desc(ScoreSpec.score))
                     .limit(monograms_limit)
                     )

    top_multigrams = (query
                      .filter(Ngram.n >= 2)
                      .order_by(desc(ScoreSpec.score))
                      .limit(multigrams_limit)
                      )

    stop_ngrams = (session.query(NodeNgram.ngram_id)
                   .filter(NodeNgram.node_id == list_stop_id)
                   .all()
                   )

    # NOTE(review): queried but never used (cf. commented outerjoins above)
    grouped_ngrams = (session.query(NodeNgramNgram.ngramy_id)
                      .filter(NodeNgramNgram.node_id == list_group_id)
                      .all()
                      )

    # get or create the MAPLIST node of this corpus
    list_map_id = session.query(Node.id).filter(
        Node.parent_id == corpus_id,
        Node.typename == "MAPLIST").first()
    if list_map_id is None:
        corpus = cache.Node[corpus_id]
        user_id = corpus.user_id
        list_map = Node(name="MAPLIST",
                        parent_id=corpus_id,
                        user_id=user_id,
                        typename="MAPLIST")
        session.add(list_map)
        session.commit()
        list_map_id = list_map.id

    # empty the map list before refilling it
    session.query(NodeNgram).filter(
        NodeNgram.node_id == list_map_id).delete()
    session.commit()

    # set lookup instead of rescanning the stop list per candidate
    # (the original rebuilt list(stop_ngrams) on every iteration)
    stop_ids = {row[0] for row in stop_ngrams}

    # keep the best non-stop candidates
    # (BUGFIX: the original zip() against range(1, limit) silently
    # dropped one slot; we now keep up to exactly `limit` ngrams)
    kept_ids = [ngram_id
                for (ngram_id, score) in list(top_multigrams) + list(top_monograms)
                if ngram_id not in stop_ids][:limit]

    data = [(list_map_id, ngram_id, 1) for ngram_id in kept_ids]
    #print(data)
    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], data)
    # BUGFIX: `dbg` was never defined (its creation is commented out
    # above), so dbg.show(...) raised NameError; plain print instead
    print('MapList computed')
gargantext/util/toolchain/list_stop.py
0 → 100644
View file @
08fc1367
from
gargantext.util.db
import
*
from
gargantext.util.db_cache
import
*
from
gargantext.constants
import
*
from
gargantext.models.users
import
User
from
gargantext.models.nodes
import
Node
from
gargantext.models.ngrams
import
Ngram
,
NodeNgram
import
re
import
sqlalchemy
as
sa
from
sqlalchemy.sql
import
func
from
sqlalchemy.orm
import
aliased
from
sqlalchemy
import
desc
,
asc
,
or_
,
and_
,
Date
,
cast
,
select
,
literal_column
#from ngram.tools import insert_ngrams
# Patterns flagging "noise" terms: too short, containing digits or
# punctuation, HTML paragraph marks, or overly generic (French/English)
# vocabulary. Compiled once at import time instead of on every call.
_STOPWORD_REGEXES = [re.compile(pattern) for pattern in (
    "^.{1,2}$"                  # 1- or 2-character terms
    , r"(.*)\d(.*)"             # contains a digit
    , r"(.*)(\.)(.*)"           # contains a period
    , r"(.*)(\,)(.*)"           # contains a comma
    , "(.*)(< ?/?p ?>)(.*)"     # marques de paragraphes (paragraph marks)
    , "(.*)(study)(.*)"
    , "(.*)(xx|xi|xv)(.*)"
    , "(.*)(result)(.*)"
    , "(.*)(année|nombre|moitié)(.*)"
    , "(.*)(temps)(.*)"
    , "(.*)(%)(.*)"
    , r"(.*)(\{)(.*)"
    , "(.*)(terme)(.*)"
    , "(.*)(différent)(.*)"
    , "(.*)(travers)(.*)"
    , r"(.*)(:|\|)(.*)"
)]


def isStopWord(ngram, stop_words=None):
    '''
    ngram :: (Int, String) => (ngram_id, ngram_terms)
    stop_words :: Set of String
    (to avoid SQL query each time isStopWord is invoked, get in as parameter)

    Returns True when the ngram's terms belong to the known stop words or
    match one of the noise patterns above, False otherwise.
    BUGFIX: the original crashed with TypeError when stop_words was left
    as None (the default), and returned None instead of False on
    non-stop words; it also recompiled every regex on each call.
    '''
    word = ngram[1]
    if stop_words is not None and word in stop_words:
        return True
    return any(regex.match(word) for regex in _STOPWORD_REGEXES)
def create_gargantua_resources():
    '''
    Create the shared "Resources" project of the "gargantua" super user,
    together with its platform-wide STOPLIST node.

    BUGFIX: the original built the stop list with parent_id=project.id
    *before* the project was flushed, so project.id was still None and
    the stop list ended up parentless; the project is now committed
    first so its id is assigned.
    '''
    # NOTE(review): .first() yields a 1-element row, not a scalar id;
    # kept as in the sibling functions of this module — confirm intended
    gargantua_id = session.query(User.id).filter(
        User.username == "gargantua").first()

    project = Node(name="Resources",
                   user_id=gargantua_id,
                   typename="PROJECT")
    session.add(project)
    session.commit()   # flush so project.id is assigned

    stopList = Node(name="STOPLIST",
                    parent_id=project.id,
                    user_id=gargantua_id,
                    typename="STOPLIST")
    session.add(stopList)
    session.commit()
def compute_stop(corpus_id, stopList_id=None, limit=2000, debug=False):
    '''
    Create the list of stop words of a corpus.

    corpus_id   -- id of the corpus Node
    stopList_id -- optional id of an existing STOPLIST node
    limit       -- currently unused (kept for call-site compatibility)
    debug       -- when True, deletes the stop list node (tests only)
                   and prints intermediate data

    TODO do a function to get all stop words with social scores
    '''
    # Get the StopList if it exists or create a new one
    # At this step of development, a new StopList should be created
    if stopList_id is None:
        stopList_id = session.query(Node.id).filter(
            Node.parent_id == corpus_id,
            Node.typename == "STOPLIST").first()
        if stopList_id is None:
            corpus = cache.Node[corpus_id]
            user_id = corpus.user_id
            stopList = Node(name="STOPLIST",
                            parent_id=corpus_id,
                            user_id=user_id,
                            typename="STOPLIST")
            session.add(stopList)
            session.commit()
            stopList_id = stopList.id

    # For tests only
    if debug is True:
        session.query(Node).filter(Node.id == stopList_id).delete()
        session.commit()

    # Get common resources, all common StopWords on the platform
    ## First get the id of the StopList of Gargantua super user
    gargantua_id = session.query(User.id).filter(
        User.username == "gargantua").first()
    rootStopList_id = session.query(Node.id).filter(
        Node.user_id == gargantua_id,
        Node.typename == "STOPLIST").first()

    ## Then get all the stop words
    ## stop_words :: Set of String
    ## BUGFIX: the query yields 1-element rows; isStopWord tests string
    ## membership, so the original list of tuples made that test always
    ## False — unpack the rows into a set of plain strings
    stop_words = {terms for (terms,) in
                  (session.query(Ngram.terms)
                   .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                   .filter(NodeNgram.node_id == rootStopList_id)
                   .all())}
    if debug is True:
        print(stop_words)

    ## Get the corpus document ngrams with their frequency
    ## ngrams :: [(Int, String, Int)]
    frequency = sa.func.count(NodeNgram.weight)
    ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
              .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
              .join(Node, Node.id == NodeNgram.node_id)
              .filter(Node.parent_id == corpus_id,
                      Node.typename == "DOCUMENT")
              .group_by(Ngram.id)
              .order_by(desc(frequency))
              #.limit(limit)
              .all()
              )

    # BUGFIX: this was a filter() iterator that the debug print below
    # consumed, leaving the saved stop list empty; materialize as a list
    ngrams_to_stop = [x for x in ngrams
                      if isStopWord(x, stop_words=stop_words)]
    if debug is True:
        print(ngrams_to_stop)

    # save the stop list (weight -1 flags a stopped ngram)
    stop = LISTTYPES["STOPLIST"]({n[0]: -1 for n in ngrams_to_stop})
    stop.save(stopList_id)
#
gargantext/util/toolchain/ngrams_tools.py
0 → 100644
View file @
08fc1367
from
gargantext.util.db
import
*
from
gargantext.util.db_cache
import
*
from
gargantext.constants
import
*
from
gargantext.models.ngrams
import
Ngram
,
NodeNgram
,
NodeNgramNgram
def insert_ngrams(ngrams, get='terms-id'):
    '''
    insert_ngrams :: [(String, Int)] -> dict[terms] = id

    Bulk-insert (terms, n) pairs into the Ngram table, skipping the ones
    already present, and return a dict mapping each terms string to its
    database id.

    ngrams -- iterable of (terms, n) tuples (n = ngram length)
    get    -- unused in the body (kept in the signature for callers)
    '''
    db, cursor = get_cursor()
    # staging area: candidate ngrams with an initially unknown id
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__ngram (
            id INT,
            terms VARCHAR(255) NOT NULL,
            n INT
        );
    ''')
    bulk_insert('tmp__ngram', ['terms', 'n'], ngrams, cursor=cursor)
    # first pass: resolve the ids of ngrams that already exist
    # NOTE(review): matches on terms only (not n) — if two ngrams could
    # share terms with different n, this pass would be ambiguous; confirm
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            tmp__ngram.terms = ngram.terms
    ''' % (Ngram.__table__.name,))
    # insert the genuinely new ngrams (those still without an id)
    cursor.execute('''
        INSERT INTO
            %s (terms, n)
        SELECT
            terms, n
        FROM
            tmp__ngram
        WHERE
            id IS NULL
    ''' % (Ngram.__table__.name,))
    # second pass: pick up the ids of the freshly inserted rows
    # (matched on both terms and n this time)
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            ngram.terms = tmp__ngram.terms
        AND
            ngram.n = tmp__ngram.n
        AND
            tmp__ngram.id IS NULL
    ''' % (Ngram.__table__.name,))
    # build the terms -> id mapping to return
    ngram_ids = dict()
    cursor.execute('SELECT id, terms FROM tmp__ngram')
    for row in cursor.fetchall():
        ngram_ids[row[1]] = row[0]
    db.commit()
    return(ngram_ids)
gargantext/util/toolchain/score_occurrences.py
0 → 100644
View file @
08fc1367
from
gargantext_web.db
import
get_session
,
cache
,
get_cursor
from
gargantext_web.db
import
Node
,
NodeNgram
,
NodeNodeNgram
from
gargantext_web.db
import
get_or_create_node
#from admin.utils import DebugTime
def compute_occs(corpus, debug=True):
    '''
    compute_occs :: Corpus -> IO ()

    Compute the occurrence count (sum of NodeNgram weights over the
    corpus documents) of each ngram and store the scores as
    NodeNodeNgram rows attached to the corpus' "Occurrences" node.

    corpus -- the corpus Node
    debug  -- when True, re-query and print the stored scores
    '''
    #dbg = DebugTime('Corpus #%d - OCCURRENCES' % corpus.id)
    #dbg.show('Calculate occurrences')

    # BUGFIX: the original referenced `mysession` (and `session`)
    # without ever defining them (NameError); open a session via the
    # imported get_session and use it throughout
    session = get_session()

    occs_node = get_or_create_node(nodetype='Occurrences',
                                   corpus=corpus,
                                   mysession=session)
    #print(occs_node.id)

    # wipe any previous scores attached to this node
    (session.query(NodeNodeNgram)
     .filter(NodeNodeNgram.nodex_id == occs_node.id)
     .delete()
     )
    session.commit()

    db, cursor = get_cursor()
    cursor.execute('''
        INSERT INTO
            %s (nodex_id, nodey_id, ngram_id, score)
        SELECT
            %d AS nodex_id,
            %d AS nodey_id,
            nodengram.ngram_id AS ngram_id,
            SUM(nodengram.weight) AS score
        FROM
            %s AS nodengram
        INNER JOIN
            %s AS node ON nodengram.node_id = node.id
        WHERE
            node.parent_id = %d
        AND
            node.type_id = %d
        GROUP BY
            nodengram.ngram_id
        ''' % (NodeNodeNgram.__table__.name
              , occs_node.id
              , corpus.id
              , NodeNgram.__table__.name
              , Node.__table__.name
              , corpus.id
              , cache.NodeType['Document'].id
              )
        )
    db.commit()

    if debug is True:
        data = (session.query(NodeNodeNgram)
                .filter(NodeNodeNgram.nodex_id == occs_node.id)
                .all())
        print([n for n in data])
gargantext/util/toolchain/score_specificity.py
0 → 100644
View file @
08fc1367
from
gargantext.util.db
import
*
from
gargantext.util.db_cache
import
*
from
gargantext.constants
import
*
from
gargantext.util.analysis.cooccurrences
import
do_cooc
from
gargantext.models.ngrams
import
Ngram
,
NodeNgram
,
\
NodeNgramNgram
,
NodeNodeNgram
import
numpy
as
np
import
pandas
as
pd
from
collections
import
defaultdict
from
sqlalchemy
import
desc
,
asc
,
or_
,
and_
,
Date
,
cast
,
select
def specificity(cooc_id=None, corpus=None, limit=100, session=None):
    '''
    Compute the specificity, simple calculus.

    Reads the cooccurrence rows stored under node `cooc_id`, derives a
    per-ngram specificity score (row sums minus column sums of the
    row-normalized symmetric cooccurrence matrix) and saves it as
    NodeNodeNgram rows under the corpus' SPECIFICITY node (created if
    missing). Returns the id of the SPECIFICITY node.
    '''
    # BUGFIX: the original referenced an undefined `corpus_id`
    # (NameError); derive it from the corpus parameter
    corpus_id = corpus.id

    cooccurrences = (session.query(NodeNgramNgram)
                     .filter(NodeNgramNgram.node_id == cooc_id)
                     .order_by(NodeNgramNgram.score)
                     .limit(limit)
                     )

    # symmetric cooccurrence matrix as nested dicts
    matrix = defaultdict(lambda: defaultdict(float))
    for cooccurrence in cooccurrences:
        matrix[cooccurrence.ngramx_id][cooccurrence.ngramy_id] = cooccurrence.score
        matrix[cooccurrence.ngramy_id][cooccurrence.ngramx_id] = cooccurrence.score

    x = pd.DataFrame(matrix).fillna(0)
    # row-normalize, then score = (row sums - column sums) / 2(n-1)
    x = x / x.sum(axis=1)
    xs = x.sum(axis=1)
    ys = x.sum(axis=0)
    m = (xs - ys) / (2 * (x.shape[0] - 1))
    # NOTE(review): Series.sort is removed in later pandas
    # (use sort_values); kept for compatibility with the pinned version
    m = m.sort(inplace=False)

    #node = get_or_create_node(nodetype='Specificity',corpus=corpus)
    node = session.query(Node).filter(
        Node.parent_id == corpus_id,
        Node.typename == "SPECIFICITY").first()
    if node is None:
        corpus = cache.Node[corpus_id]
        user_id = corpus.user_id
        node = Node(name="SPECIFICITY",
                    parent_id=corpus_id,
                    user_id=user_id,
                    typename="SPECIFICITY")
        session.add(node)
        session.commit()

    # BUGFIX: range(1, m.shape[0]) produced one id too few, silently
    # dropping the last scored ngram; emit exactly one row per score
    count = m.shape[0]
    data = zip([node.id] * count,
               [corpus.id] * count,
               m.index.tolist(),
               m.values.tolist())

    # replace any previous specificity scores of this node
    session.query(NodeNodeNgram).filter(
        NodeNodeNgram.nodex_id == node.id).delete()
    session.commit()

    bulk_insert(NodeNodeNgram,
                ['nodex_id', 'nodey_id', 'ngram_id', 'score'],
                [d for d in data])
    return(node.id)
def compute_specificity(corpus, limit=100, session=None):
    '''
    Computing specificities as NodeNodeNgram.
    All workflow is the following:
    1) Compute the cooc matrix
    2) Compute the specificity score, saving it in database, return its Node

    corpus  -- the corpus Node
    limit   -- max number of cooccurrences / scores to compute
    session -- SQLAlchemy session (required, forwarded to specificity)
    '''
    #dbg = DebugTime('Corpus #%d - specificity' % corpus.id)
    #list_cvalue = get_or_create_node(nodetype='Cvalue', corpus=corpus)
    # FIXME(review): `list_cvalue` is referenced below but the line
    # creating it above is commented out, so this raises NameError at
    # runtime — restore the get_or_create_node call (and import the
    # helper into this module) before using this function
    cooc_id = do_cooc(corpus=corpus, cvalue_id=list_cvalue.id, limit=limit)
    specificity(cooc_id=cooc_id, corpus=corpus, limit=limit, session=session)
    #dbg.show('specificity')
#dbg.show('specificity')
#corpus=session.query(Node).filter(Node.id==244250).first()
#compute_specificity(corpus)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment