humanities / gargantext · Commits

Commit 4c9cb5ea authored Nov 06, 2015 by delanoe

    [FEAT] Keep node.

parent e02cd586

Showing 7 changed files with 153 additions and 134 deletions (+153 / -134)
analysis/functions.py    +9   -12
init.py                  +1   -1
ngram/group.py           +20  -56
ngram/mapList.py         +59  -17
ngram/stop.py            +24  -37
ngram/tools.py           +27  -0
ngram/workflow.py        +13  -11
analysis/functions.py

@@ -29,7 +29,6 @@ from sqlalchemy.orm import aliased
 def diag_null(x):
     return x - x * scipy.eye(x.shape[0])

 def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
     '''
     do_distance :: Int -> (Graph, Partition, {ids}, {weight})
@@ -75,10 +74,10 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
     n = n.sort(inplace=False)
     m = m.sort(inplace=False)

-    nodes_included = 300 #int(round(size/20,0))
+    nodes_included = 500 #int(round(size/20,0))
     #nodes_excluded = int(round(size/10,0))

-    nodes_specific = 300 #int(round(size/10,0))
+    nodes_specific = 500 #int(round(size/10,0))
     #nodes_generic = int(round(size/10,0))

     # TODO use the included score for the node size
@@ -87,6 +86,7 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
     #m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
     # Specific:
     m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
+    #m_index = pd.Index.intersection(x.index, n.index[:nodes_included])

     x_index = pd.Index.union(n_index, m_index)
     xx = x[list(x_index)].T[list(x_index)]
@@ -113,7 +113,6 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
     return(G, partition, ids, weight)

 def get_cooc(request=None, corpus=None
             , field1='ngrams', field2='ngrams'
             , cooc_id=None, type='node_link', size=1000
@@ -126,7 +125,7 @@ def get_cooc(request=None, corpus=None
     data = {}
     #if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
         print("Coocurrences do not exist yet, create it.")
-        miam_id  = get_or_create_node(nodetype='MiamList', corpus=corpus).id
+        miam_id  = get_or_create_node(nodetype='MapList', corpus=corpus).id
         stop_id  = get_or_create_node(nodetype='StopList', corpus=corpus).id
         group_id = get_or_create_node(nodetype='Group', corpus=corpus).id
@@ -141,9 +140,9 @@ def get_cooc(request=None, corpus=None
         #cooc_id = get_or_create_node(nodetype='Cooccurrence', corpus=corpus).id
         cooc_id = do_cooc(corpus=corpus, field1="ngrams", field2="ngrams"
                          , miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size
-                         , isMonopartite=isMonopartite, start=start, end=end, apax=apax)
+                         , isMonopartite=True, start=start, end=end, apax=apax)

-    G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=isMonopartite)
+    G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=True)

     if type == "node_link":
         nodesB_dict = {}
@@ -173,8 +172,8 @@ def get_cooc(request=None, corpus=None
             s = e[0]
             t = e[1]
             info = {
                 "s": ids[s][1],
                 "t": ids[t][1],
                 "w": G[ids[s][1]][ids[t][1]]["weight"]
                 }
             # print(info)
@@ -216,15 +215,13 @@ def get_cooc(request=None, corpus=None
     return(data)

 def get_graphA(nodeA_type, NodesB, links, corpus):
     from analysis.InterUnion import Utils
     print(" = = = == = = = ")
     print("In get_graphA(), corpus id:", corpus.id)
     nodeA_type_id = cache.Hyperdata[nodeA_type].id
-    threshold_cotainf = 0.05
+    threshold_cotainf = 0.02
     max_nodeid = -1
     for nodeid in NodesB:
         if nodeid > max_nodeid:
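The node-selection change above only bumps nodes_included and nodes_specific from 300 to 500; the surrounding logic keeps the most specific terms, unions them with the included ones, and slices the cooccurrence matrix down to that index. A minimal, self-contained pandas sketch of that slicing follows; the toy DataFrame, the ranking series n and m, and the n_index line are assumptions for illustration, only the m_index, x_index and xx lines mirror the diff, and sort_values stands in for the deprecated .sort(inplace=False):

import pandas as pd
import numpy as np

# Toy stand-in for the cooccurrence matrix x: a small square DataFrame
# indexed by ngram ids (hypothetical values, for illustration only).
ids = [10, 20, 30, 40, 50]
x = pd.DataFrame(np.random.randint(0, 5, (5, 5)), index=ids, columns=ids)

# n and m play the role of the two ranking series; here simply the row and
# column sums, sorted ascending as in the original code.
n = x.sum(axis=1).sort_values()
m = x.sum(axis=0).sort_values()

nodes_included = 3   # the commit raises the real values from 300 to 500
nodes_specific = 3

# Keep the most "included" and most "specific" ngrams, then take the union.
n_index = pd.Index.intersection(x.index, n.index[-nodes_included:])
m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
x_index = pd.Index.union(n_index, m_index)

# Restrict the cooccurrence matrix to the selected ngrams (rows and columns).
xx = x[list(x_index)].T[list(x_index)]
print(xx.shape)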
init.py

@@ -91,7 +91,7 @@ print('Initialize node types...')
 node_types = [
     'Root', 'Trash',
     'Project', 'Corpus', 'Document',
-    'MiamList', 'StopList', 'MainList',
+    'MiamList', 'StopList', 'MainList', 'MapList', # TODO MiamList -> MainList
     'Stem', 'Lem', 'Group', 'Tfidf', 'Tfidf (global)', 'Cvalue', 'Specificity'
     , 'Cooccurrence',
 ]
ngram/group.py

@@ -7,6 +7,7 @@ from gargantext_web.db import NodeNgram,NodeNodeNgram
 from gargantext_web.db import *
 from gargantext_web.db import get_or_create_node
+from analysis.lists import Translations, UnweightedList
 from parsing.corpustools import *

 import sqlalchemy as sa
@@ -21,62 +22,7 @@ from collections import defaultdict
 from math import log
 from functools import reduce

-def queryNodeNodeNgram(nodeMeasure_id=None, corpus_id=None, limit=None):
-    '''
-    queryNodeNodeNgram :: Int -> Int -> Int -> (Int, String, Float)
-    Get list of ngrams according to a measure related to the corpus: maybe tfidf
-    cvalue.
-    '''
-    query = (session.query(Ngram.id, Ngram.terms, NodeNodeNgram.score)
-                    .join(NodeNodeNgram, NodeNodeNgram.ngram_id == Ngram.id)
-                    .join(Node, Node.id == NodeNodeNgram.nodex_id)
-                    .filter(NodeNodeNgram.nodex_id == nodeMeasure_id)
-                    .filter(NodeNodeNgram.nodey_id == corpus_id)
-                    .group_by(Ngram.id, Ngram.terms, NodeNodeNgram.score)
-                    .order_by(desc(NodeNodeNgram.score))
-            )
-
-    if limit is None:
-        query = query.count()
-    elif limit == 0:
-        query = query.all()
-    else:
-        query = query.limit(limit)
-
-    return(query)
-
-def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
-    '''
-    getNgrams :: Corpus -> [(Int, String)] -> [(Int, String)]
-    For a corpus, gives list of highest Cvalue ngrams and highest TFIDF (global)
-    ngrams that have to be grouped with
-    '''
-    #tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
-    cvalue_node = get_or_create_node(nodetype='Cvalue', corpus=corpus)
-    spec_node = get_or_create_node(nodetype='Specificity', corpus=corpus)
-
-    #tfidf_ngrams = queryNodeNodeNgram(nodeMeasure_id=tfidf_node.id, corpus_id=corpus.id)
-    cvalue_ngrams = queryNodeNodeNgram(nodeMeasure_id=cvalue_node.id, corpus_id=corpus.id, limit=limit_sup)
-    spec_ngrams = queryNodeNodeNgram(nodeMeasure_id=spec_node.id, corpus_id=corpus.id, limit=limit_inf)
-
-    #print([n for n in tfidf_ngrams])
-
-    def list2set(_list):
-        _set = set()
-        for n in _list:
-            _set.add((n[0],n[1]))
-        return(_set)
-
-    cvalue_set = set()
-    spec_set = set()
-
-    cvalue_set = list2set(cvalue_ngrams)
-    spec_set = list2set(spec_ngrams)
-
-    cvalue_setDiff = cvalue_set.difference(spec_set)
-
-    return(spec_set, cvalue_setDiff)
-
 def getStemmer(corpus):
     '''
@@ -121,17 +67,35 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
     miam_to_insert = set()
     miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
+    stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
+    #stop_list = UnweightedList(stop_node.id)
+    Stop = aliased(NodeNgram)

     frequency = sa.func.count(NodeNgram.weight)
     ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
              .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
              .join(Node, Node.id == NodeNgram.node_id)
+             #.outerjoin(Stop, Stop.ngram_id == Ngram.id)
+             #.filter(Stop.node_id == stop_node.id, Stop.ngram_id == None)
              .filter(Node.parent_id==corpus.id, Node.type_id==cache.NodeType['Document'].id)
              .group_by(Ngram.id)
              .order_by(desc(frequency))
              #.all()
              .limit(limit_sup)
              )

+    stops = (session.query(Ngram.id, Ngram.terms, frequency)
+            .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+            .join(Node, Node.id == NodeNgram.node_id)
+            .join(Stop, Stop.ngram_id == Ngram.id)
+            .filter(Stop.node_id == stop_node.id)
+            .filter(Node.parent_id==corpus.id, Node.type_id==cache.NodeType['Document'].id)
+            .group_by(Ngram.id)
+            .all()
+            )
+
+    ngrams = [n for n in ngrams if n not in stops]
+    print(ngrams)
+
     #group = defaultdict(lambda : defaultdict())
     ids_dict = dict()
     mainform_dict = dict()
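The new stop-list handling in compute_groups boils down to "query the candidates, query the stop ngrams, drop candidates that are also stops". A minimal stand-alone sketch of that set-difference step (plain Python, no database; the tuples stand in for the (ngram_id, terms, frequency) rows the two queries return):

# Rows as returned by the two queries: (ngram_id, terms, frequency)
ngrams = [(1, 'climate change', 42), (2, 'the', 120), (3, 'carbon tax', 17)]
stops  = [(2, 'the', 120)]

# Same filtering as the new code: keep only candidates absent from the stop list.
ngrams = [n for n in ngrams if n not in stops]
print(ngrams)   # [(1, 'climate change', 42), (3, 'carbon tax', 17)]

The commented-out .outerjoin/.filter lines in the diff sketch the alternative of excluding stop ngrams directly in SQL (an anti-join) instead of comparing rows in Python.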
ngram/miam.py → ngram/mapList.py

 # Without this, we couldn't use the Django environment
-#from admin.env import *
+from admin.env import *
 #from ngram.stemLem import *

 from admin.utils import PrintException, DebugTime
@@ -15,42 +15,51 @@ from sqlalchemy.orm import aliased
 from ngram.tools import insert_ngrams
 import csv

-def compute_miam(corpus, limit=500):
+def compute_mapList(corpus, limit=500):
     '''
     According to Specificities and stoplist,
     '''

     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

-    node_group = get_or_create_node(nodetype='Group', corpus=corpus)
+    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
     node_stop = get_or_create_node(nodetype='StopList', corpus=corpus)
+    node_group = get_or_create_node(nodetype='Group', corpus=corpus)
     node_spec = get_or_create_node(nodetype='Specificity', corpus=corpus)

+    Miam = aliased(NodeNgram)
     Stop = aliased(NodeNgram)
     Group = aliased(NodeNgramNgram)
     Spec = aliased(NodeNodeNgram)

-    top_miam = (session.query(Spec.ngram_id, Spec.score)
-        .outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
-        .outerjoin(Stop, Stop.ngram_id == Spec.ngram_id)
-        .filter(Group.node_id == node_group.id)
-        .filter(Stop.node_id == node_stop.id)
+    top_ngrams = (session.query(Spec.ngram_id, Spec.score)
+        .join(Miam, Spec.ngram_id == Miam.ngram_id)
+        #.outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
+        #.outerjoin(Stop, Stop.ngram_id == Spec.ngram_id)
+        .filter(Miam.node_id == node_miam.id)
+        #.filter(Group.node_id == node_group.id)
+        #.filter(Stop.node_id == node_stop.id)
         .filter(Spec.nodex_id == node_spec.id)
         .order_by(desc(Spec.score))
         .limit(limit)
         )

-    print([t for t in top_miam])
-    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
-    session.query(NodeNgram).filter(NodeNgram.node_id == node_miam.id).delete()
+    #print([t for t in top_ngrams])
+    node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus)
+    session.query(NodeNgram).filter(NodeNgram.node_id == node_mapList.id).delete()
     session.commit()

     data = zip(
-        [node_miam.id for i in range(1, limit)]
-        , [n[0] for n in top_miam]
+        [node_mapList.id for i in range(1, limit)]
+        , [n[0] for n in top_ngrams]
         , [1 for i in range(1, limit)]
         )
-    print([d for d in data])
+    #print([d for d in data])
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

-    dbg.show('Miam computed')
+    dbg.show('MapList computed')

 def insert_miam(corpus, ngrams=None, path_file_csv=None):
     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
@@ -87,8 +96,41 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
     file_csv.close()
     dbg.show('Miam computed')

-#corpus = session.query(Node).filter(Node.id==556113).first()
+#corpus = session.query(Node).filter(Node.id==540420).first()
+#compute_mapList(corpus)
 #insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")

#def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
# '''
# getNgrams :: Corpus -> [(Int, String)] -> [(Int, String)]
# For a corpus, gives list of highest Cvalue ngrams and highest TFIDF (global)
# ngrams that have to be grouped with
# '''
# #tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
# cvalue_node = get_or_create_node(nodetype='Cvalue', corpus=corpus)
# spec_node = get_or_create_node(nodetype='Specificity', corpus=corpus)
#
#
# #tfidf_ngrams = queryNodeNodeNgram(nodeMeasure_id=tfidf_node.id, corpus_id=corpus.id)
# cvalue_ngrams = queryNodeNodeNgram(nodeMeasure_id=cvalue_node.id, corpus_id=corpus.id, limit=limit_sup)
# spec_ngrams = queryNodeNodeNgram(nodeMeasure_id=spec_node.id, corpus_id=corpus.id, limit=limit_inf)
#
# #print([n for n in tfidf_ngrams])
#
# def list2set(_list):
# _set = set()
# for n in _list:
# _set.add((n[0],n[1]))
# return(_set)
#
# cvalue_set = set()
# spec_set = set()
#
# cvalue_set = list2set(cvalue_ngrams)
# spec_set = list2set(spec_ngrams)
#
# cvalue_setDiff = cvalue_set.difference(spec_set)
#
# return(spec_set,cvalue_setDiff)
#
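compute_mapList builds the rows for bulk_insert by zipping three parallel lists: the map-list node id repeated, the selected ngram ids, and a constant weight of 1. A small stand-alone sketch of that construction (the ids are made up and bulk_insert is replaced by a print):

node_mapList_id = 99                             # hypothetical MapList node id
top_ngrams = [(11, 0.9), (12, 0.7), (13, 0.5)]   # (ngram_id, score) pairs
limit = len(top_ngrams) + 1

data = zip(
    [node_mapList_id for i in range(1, limit)]
    , [n[0] for n in top_ngrams]
    , [1 for i in range(1, limit)]
    )

# Each row is (node_id, ngram_id, weight), the columns bulk_insert expects.
for row in data:
    print(row)   # (99, 11, 1), (99, 12, 1), (99, 13, 1)

Note that range(1, limit) yields limit - 1 repetitions, so the node-id and weight columns are one shorter than a full limit; zip truncates everything to the shortest list anyway.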
ngram/stop.py

 # Without this, we couldn't use the Django environment
 #from admin.env import *
 #from ngram.stemLem import *

 import re
 from admin.utils import PrintException

-from gargantext_web.db import NodeNgram, NodeNodeNgram
-from gargantext_web.db import cache, session, get_or_create_node
-import sqlalchemy as sa
+from gargantext_web.db import Node, Ngram, NodeNgram, NodeNodeNgram
+from gargantext_web.db import cache, session, get_or_create_node, bulk_insert

 from sqlalchemy.sql import func
 from sqlalchemy import desc, asc, or_, and_, Date, cast, select
 from sqlalchemy import literal_column
@@ -38,7 +35,6 @@ def importStopList(node,filename,language='fr'):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

 def isStopWord(ngram, stop_words=None):
     '''
     ngram :: (Int, String) => (ngram_id, ngram_terms)
@@ -55,8 +51,9 @@ def isStopWord(ngram, stop_words=None):
     if format_regex.match(word):
         return(True)

-    for regex in ["(.*)\d(.*)"
-                 , "^.{1,2}$"
+    for regex in ["^.{1,2}$"
+                 , "(.*)\d(.*)"
                  , "(.*)(\.)(.*)"
                  , "(.*)(\,)(.*)"
                  , "(.*)(study)(.*)"
@@ -73,13 +70,11 @@ def isStopWord(ngram, stop_words=None):
         if test_match(word, regex) is True:
             return(True)

-def compute_stop(corpus, limit=2000, debug=False):
+def compute_stop(corpus, size=2000, debug=False):
     '''
     do some statitics on all stop lists of database of the same type
     '''
     stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
+    miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
     # TODO do a function to get all stop words with social scores
     root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
@@ -90,34 +85,26 @@ def compute_stop(corpus,size=2000,debug=False):
             .filter(NodeNgram.node_id == root_stop_id)
             .all()
             )
     #print([n for n in stop_words])

-    frequency = sa.func.count(NodeNgram.weight)
-    ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
-             .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
-             .join(Node, Node.id == NodeNgram.node_id)
-             .filter(Node.parent_id==corpus.id, Node.type_id==cache.NodeType['Document'].id)
-             .group_by(Ngram.id)
-             .order_by(desc(frequency))
-             .all()
-             #.limit(limit)
-             )
-    ngrams_to_stop = filter(lambda x: isStopWord(x, stop_words=stop_words), ngrams)
+    top_words = (session.query(Ngram.id, Ngram.terms)
+                .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+                .filter(NodeNgram.node_id == miam_node.id)
+                .order_by(desc(NodeNgram.weight))
+                .limit(size)
+                )
+    ngrams_to_stop = filter(lambda x: isStopWord(x, stop_words=stop_words), top_words)

     #print([n for n in ngrams_to_stop])

     stop = WeightedList({n[0]: -1 for n in ngrams_to_stop})
     stop.save(stop_node.id)

+    miam = UnweightedList(miam_node.id)
+    new_miam = miam - stop
+    new_miam.save(miam_node.id)
# data = zip(
# [stop_node.id for i in range(0,size)]
# , [ngram[0] for ngram in ngrams_to_stop]
# , [-1 for i in range(0,size)]
# )
# bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
#corpus=session.query(Node).filter(Node.id==545461).first()
#compute_stop(corpus)
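The reworked compute_stop takes the top terms of the MiamList, runs them through isStopWord's regex battery, and subtracts the resulting stop list from the miam list. A reduced, self-contained sketch of the filtering part, using only a few of the regexes from the diff and made-up terms (the function name and data are illustrative, not the project's API):

import re

def is_stop_word(ngram):
    '''ngram :: (ngram_id, ngram_terms) -- simplified version of isStopWord'''
    word = ngram[1]
    for regex in [r"^.{1,2}$"          # one- or two-character terms
                 , r"(.*)\d(.*)"       # terms containing a digit
                 , r"(.*)(study)(.*)"]:
        if re.match(regex, word):
            return True
    return False

top_words = [(1, 'of'), (2, 'case study'), (3, 'covid 19'), (4, 'gene expression')]
ngrams_to_stop = [n for n in top_words if is_stop_word(n)]
print(ngrams_to_stop)   # [(1, 'of'), (2, 'case study'), (3, 'covid 19')]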
ngram/tools.py

@@ -109,3 +109,30 @@ def insert_nodengramngram(nodengramngram):
     ''' % (NodeNgramNgram.__table__.name,))

     db.commit()

#def queryNodeNodeNgram(nodeMeasure_id=None, corpus_id=None, limit=None):
# '''
# queryNodeNodeNgram :: Int -> Int -> Int -> (Int, String, Float)
# Get list of ngrams according to a measure related to the corpus: maybe tfidf
# cvalue.
# '''
# query = (session.query(Ngram.id, Ngram.terms, NodeNodeNgram.score)
# .join(NodeNodeNgram, NodeNodeNgram.ngram_id == Ngram.id)
# .join(Node, Node.id == NodeNodeNgram.nodex_id)
# .filter(NodeNodeNgram.nodex_id == nodeMeasure_id)
# .filter(NodeNodeNgram.nodey_id == corpus_id)
# .group_by(Ngram.id, Ngram.terms, NodeNodeNgram.score)
# .order_by(desc(NodeNodeNgram.score))
# )
#
# if limit is None:
# query = query.count()
# elif limit == 0 :
# query = query.all()
# else:
# query = query.limit(limit)
#
# return(query)
#
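The commented-out queryNodeNodeNgram keeps the small convention used for its limit parameter: None means "just count", 0 means "return everything", any other value caps the result. A plain-Python sketch of that dispatch, with a list standing in for the SQLAlchemy query object (take is a hypothetical helper, not part of the codebase):

def take(results, limit=None):
    # None -> count, 0 -> everything, otherwise cap at `limit`
    if limit is None:
        return len(results)
    elif limit == 0:
        return results
    else:
        return results[:limit]

scores = [(1, 'ngram a', 3.2), (2, 'ngram b', 1.5), (3, 'ngram c', 0.7)]
print(take(scores))       # 3
print(take(scores, 0))    # full list
print(take(scores, 2))    # first two rows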
ngram/workflow.py

@@ -4,8 +4,9 @@ from ngram.cvalue import compute_cvalue
 from ngram.specificity import compute_specificity
 #from ngram.stop import compute_stop
 from ngram.group import compute_groups
-from ngram.miam import compute_miam
 from gargantext_web.db import get_or_create_node
+from ngram.mapList import compute_mapList

 #from gargantext_web.celery import update_processing
@@ -13,31 +14,32 @@ def ngram_workflow(corpus, n=5000):
     '''
     All the workflow to filter the ngrams.
     '''
-    compute_tfidf_global(corpus)
+    #compute_tfidf_global(corpus)

-    part = round(n * 0.8)
+    part = round(n * 0.9)

-    compute_cvalue(corpus,limit=part) # size
+    #compute_cvalue(corpus,limit=part) # size

-    part = round(part * 0.4)
+    part = round(part * 0.8)
     print('spec part:', part)
-    compute_specificity(corpus,limit=part)
+    #compute_specificity(corpus,limit=part)

-    part = round(part * 0.5)
+    part = round(part * 0.8)
     # compute_stop(corpus)

     limit_inf = round(part * 1)
     limit_sup = round(part * 5)
     print(limit_inf, limit_sup)
-    compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup)
+    #compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup)

-    # compute_miam(corpus,limit=part) # size
+    compute_mapList(corpus,limit=part) # size

-    compute_tfidf(corpus)
+    #compute_tfidf(corpus)

-#corpus=session.query(Node).filter(Node.id==257579).first()
+#corpus=session.query(Node).filter(Node.id==540420).first()
+#corpus=session.query(Node).filter(Node.id==559637).first()
 #ngram_workflow(corpus)

 #update_processing(corpus, 0)
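The workflow sizes each step from the previous one. With the new factors (0.9, then 0.8 twice) and the default n=5000, the limits work out as below; a quick arithmetic sketch that mirrors only the round() calls remaining in the diff, with the old factors noted in comments:

n = 5000
part = round(n * 0.9)        # 4500  (was n * 0.8)
part = round(part * 0.8)     # 3600  (was part * 0.4) -> limit passed to specificity
part = round(part * 0.8)     # 2880  (was part * 0.5)

limit_inf = round(part * 1)  # 2880
limit_sup = round(part * 5)  # 14400
print(part, limit_inf, limit_sup)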