gargantext (humanities/gargantext) · Commit c4f45aea

Authored Nov 06, 2015 by PkSM3

Merge branch 'unstable' of ssh://delanoe.org:1979/gargantext into samuel

Parents: f3375ba6, 962d6b85

Showing 10 changed files with 1541 additions and 135 deletions (+1541, -135)
analysis/cooccurrences.py   +2    -1
analysis/functions.py       +9    -12
init.py                     +1    -1
init/stop_lists/en.txt      +464  -0
init/stop_lists/fr.txt      +922  -0
ngram/group.py              +20   -56
ngram/mapList.py            +59   -17
ngram/stop.py               +24   -37
ngram/tools.py              +27   -0
ngram/workflow.py           +13   -11
analysis/cooccurrences.py  View file @ c4f45aea

@@ -72,7 +72,8 @@ def do_cooc(corpus=None
     if isMonopartite :
         NodeNgramX = aliased(NodeNgram)
         NodeNgramY = aliased(NodeNgram)
-        cooc_score = func.sqrt(func.sum(NodeNgramX.weight) * func.sum(NodeNgramY.weight)).label('cooc_score')
+        cooc_score = func.sum(NodeNgramX.weight + NodeNgramY.weight).label('cooc_score')
+        #cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')
         cooc_query = (session.query(NodeNgramX.ngram_id, NodeNgramY.ngram_id, cooc_score)
                       .join(Node, Node.id == NodeNgramX.node_id)
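A note on the scoring change above: the old score was the square root of the product of the two summed weights, the new one is a plain sum. A minimal sketch in plain Python (toy weights, outside SQLAlchemy) contrasting how the two behave:

    from math import sqrt

    x_weights = [1, 2, 3]   # stand-in for NodeNgramX.weight values of one ngram pair
    y_weights = [2, 2, 2]   # stand-in for NodeNgramY.weight values of the same pair

    old_score = sqrt(sum(x_weights) * sum(y_weights))             # sqrt(6 * 6) = 6.0
    new_score = sum(x + y for x, y in zip(x_weights, y_weights))  # 6 + 6 = 12

    print(old_score, new_score)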
analysis/functions.py  View file @ c4f45aea

@@ -29,7 +29,6 @@ from sqlalchemy.orm import aliased
 def diag_null(x):
     return x - x * scipy.eye(x.shape[0])

 def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
     '''
     do_distance :: Int -> (Graph, Partition, {ids}, {weight})

@@ -75,10 +74,10 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
     n = n.sort(inplace=False)
     m = m.sort(inplace=False)

-    nodes_included = 300 #int(round(size/20,0))
+    nodes_included = 500 #int(round(size/20,0))
     #nodes_excluded = int(round(size/10,0))

-    nodes_specific = 300 #int(round(size/10,0))
+    nodes_specific = 500 #int(round(size/10,0))
     #nodes_generic = int(round(size/10,0))

     # TODO use the included score for the node size

@@ -87,6 +86,7 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
     #m_index = pd.Index.intersection(x.index, m.index[:nodes_generic])
     # Specific:
     m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])
+    #m_index = pd.Index.intersection(x.index, n.index[:nodes_included])
     x_index = pd.Index.union(n_index, m_index)
     xx = x[list(x_index)].T[list(x_index)]

@@ -113,7 +113,6 @@ def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
     return(G, partition, ids, weight)

 def get_cooc(request=None, corpus=None
             , field1='ngrams', field2='ngrams'
             , cooc_id=None, type='node_link', size=1000

@@ -126,7 +125,7 @@ def get_cooc(request=None, corpus=None
     data = {}
     #if session.query(Node).filter(Node.type_id==type_cooc_id, Node.parent_id==corpus_id).first() is None:
     print("Coocurrences do not exist yet, create it.")
-    miam_id = get_or_create_node(nodetype='MiamList', corpus=corpus).id
+    miam_id = get_or_create_node(nodetype='MapList', corpus=corpus).id
     stop_id = get_or_create_node(nodetype='StopList', corpus=corpus).id
     group_id = get_or_create_node(nodetype='Group', corpus=corpus).id

@@ -141,9 +140,9 @@ def get_cooc(request=None, corpus=None
     #cooc_id = get_or_create_node(nodetype='Cooccurrence', corpus=corpus).id
     cooc_id = do_cooc(corpus=corpus, field1="ngrams", field2="ngrams"
                     , miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size
-                    , isMonopartite=isMonopartite, start=start, end=end, apax=apax)
+                    , isMonopartite=True, start=start, end=end, apax=apax)
-    G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=isMonopartite)
+    G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=True)

     if type == "node_link":
         nodesB_dict = {}

@@ -173,8 +172,8 @@ def get_cooc(request=None, corpus=None
     s = e[0]
     t = e[1]
     info = {
         "s": ids[s][1],
         "t": ids[t][1],
         "w": G[ids[s][1]][ids[t][1]]["weight"]
     }
     # print(info)

@@ -216,15 +215,13 @@ def get_cooc(request=None, corpus=None
     return(data)

 def get_graphA(nodeA_type, NodesB, links, corpus):
     from analysis.InterUnion import Utils
     print(" = = = == = = = ")
     print("In get_graphA(), corpus id:", corpus.id)
     nodeA_type_id = cache.Hyperdata[nodeA_type].id
-    threshold_cotainf = 0.05
+    threshold_cotainf = 0.02
     max_nodeid = -1
     for nodeid in NodesB:
         if nodeid > max_nodeid:
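The bump from 300 to 500 nodes feeds the pandas index arithmetic just below the change. A minimal sketch of that selection logic with a toy matrix (names and sizes are made up; the real code uses the older `.sort(inplace=False)` API):

    import pandas as pd

    x = pd.DataFrame(1.0, index=list('abcdef'), columns=list('abcdef'))  # toy cooc matrix
    m = pd.Series([0.1, 0.5, 0.9, 0.2, 0.8, 0.3], index=list('abcdef')).sort_values()

    nodes_specific = 2
    m_index = pd.Index.intersection(x.index, m.index[-nodes_specific:])  # 2 highest-scoring nodes
    n_index = pd.Index.intersection(x.index, m.index[:nodes_specific])   # 2 lowest, for contrast
    x_index = pd.Index.union(n_index, m_index)

    xx = x[list(x_index)].T[list(x_index)]  # submatrix restricted to the selected nodes
    print(xx.shape)  # (4, 4)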
init.py  View file @ c4f45aea

@@ -91,7 +91,7 @@ print('Initialize node types...')
 node_types = [
     'Root', 'Trash',
     'Project', 'Corpus', 'Document',
-    'MiamList', 'StopList', 'MainList',
+    'MiamList', 'StopList', 'MainList', 'MapList', # TODO MiamList -> MainList
     'Stem', 'Lem', 'Group', 'Tfidf', 'Tfidf (global)', 'Cvalue', 'Specificity'
     , 'Cooccurrence',
 ]
init/stop_lists/en.txt  0 → 100644  View file @ c4f45aea
-
-
%
a
aboard
about
above
academicpress
according
across
activity
actual
added
address the problem
after
again
against
ahead
aim
aim of this study
all
almost
alone
along
alongside
also
although
am
americansociety
amid
amidst
among
amongst
an
and
and-or
and/or
anon
another
anti
any
anybody
anyone
anything
are
arising
around
article
as
astride
at
aught
author reports
authors report
average size
award
away
bar
barring
basis sets
be
because
become
becomes
been
before
behind
being
below
beneath
beside
besides
best
better
between
beyond
birthday
both
broad range
but
by
calculated results
calculations show
can
case
case studies
case study
certain
choice
circa
come
comes
coming
completely
concerning
conclusion
consider
considered
considering
consisting
copyright
crucial role
data
de
degreec
degreesc
degrees c
department
der
despite
did
different function
different functions
different type
different types
discussion
do
does
doesnt
doing
don
down
dr
du
due
during
each
effect
either
elsevier
elsevier ltd
elsevier science
enough
especially
et
everybody
everyone
excellent agreement
except
excepting
excluding
few
fewer
first report
first step
first time
following
for
forward
from
further
further analysis
further evidence
further investigation
future research
get
give
given
giving
good agreement
had
has
have
having
he
her
here
hers
herself
high concentration
high concentrations
higher
higher level
higher levels
higher rates
high level
high levels
high rate
him
himself
his
hisself
honor
how
i
idem
if
ilk
important role
important source
in
including
inside
instead
into
introduction
is
it
items
its
itself
john wiley
just
key role
large number
let
lets
level
levels
like
literature
literature review
little
long term
look
looks
low concentration
low concentrations
low level
low levels
ltd
made
main objective
major role
make
makes
making
many
me
meet
meets
mine
minus
month
months
more
most
much
mu g
mu m
must
my
myself
naught
near
nearly
neither
new approach
new method
new records
next
no
nobody
none
nor
not
nothing
notwithstanding
now
number
number
objective
of
off
on
once
oneself
only
onto
opposite
or
other
other factors
other hand
otherwise
our
ours
ourself
ourselves
out
outside
over
overall
own
paper
paper addresses
past
pending
per
plus
possibly
preliminary results
present algorithms
present study
present work
proposed approach
proposed method
proposed model
proposed system
pt
put
range
rate
rationale
really
recent studies
recent years
regarding
relative importance
report
reprinted
result
results
results show
review
review of literature
review of the literature
review recent
round
s
same
same time
save
second step
seen
selection
self
set
sets
several
she
should
show
shown
significant correlation
significant differences
significant increase
simple method
simulation results show
since
small number
so
so-called
some
somebody
someone
something
somewhat
spp
studies
study
such
suchlike
such systems
sufficient conditions
sundry
t
take
taken
takes
taking
test the hypothesis
than
that
the
thee
their
theirs
them
themselves
then
there
therefrom
these
they
thine
third step
this
those
thou
though
through
throughout
thyself
till
time scale
to
together
too
total
tother
toward
towards
twain
under
undergoing
underneath
unless
unlike
until
up
upon
upward
us
use
various
versus
very
via
vis-a-vis
vol
vols
vs
was
way
ways
we
were
what
whatall
whatever
whats
whatsoever
when
where
whereas
wherewith
wherewithal
which
whichever
whichsoever
while
whither
who
whoever
whom
whomever
whomso
whomsoever
whos
whose
whosoever
why
wide range
will
with
within
without
worth
ye
year
yet
yon
yonder
you
you-all
your
your
yours
yourself
yourselves
init/stop_lists/fr.txt  0 → 100644  View file @ c4f45aea
<
>
%
a
à
abord
afin
afin d'
afin de
afin qu'
afin que
ai
aie
aient
aies
ainsi
ainsi qu'
ainsi que
ait
à ladite
à laquelle
à la suite d'
à la suite de
à la suite des
à la suite du
à l'égard d'
à l'égard de
à l'égard des
à l'égard du
à l'encontre d'
à l'encontre de
à l'encontre des
à l'encontre du
à l'instar d'
à l'instar de
à l'instar des
à l'instar du
à l'insu d'
à l'insu de
à l'insu des
à l'insu du
à l'issue d'
à l'issue de
à l'issue des
à l'issue du
allaient
allo
allons
à l'occasion d'
à l'occasion de
à l'occasion des
à l'occasion du
alors
alors qu'
alors que
à même d'
à même de
an
and
aou
août
à partir d'
à partir de
à partir des
à partir du
après
après qu'
après que
are
as
assez
as-tu
at
à travers
attendu
au
au cours d'
au cours de
au cours des
au cours du
aucun
aucune
au-delà d'
au-delà de
au-delà des
au-delà du
au-devant d'
au-devant de
au-devant des
au-devant du
aujourd
aujourd'hui
auprès
auprès d'
auprès de
auprès des
auprès du
auquel
aura
aurai
auraient
aurais
aurait
auras
au regard d'
au regard de
au regard des
au regard du
aurez
auriez
aurions
aurons
auront
au sein d'
au sein de
au sein des
au sein du
aussi
aussi bien qu'
aussi bien que
au sujet d'
au sujet de
au sujet des
au sujet du
autour d'
autour de
autour des
autour du
au travers d'
au travers de
au travers des
au travers du
autre
autres
autrui
aux
auxdites
auxdits
auxquelles
auxquels
avaient
avais
avait
avant
avant qu'
avant que
avec
avez
aviez
avions
avoir
avons
avr
ayant
ayante
ayantes
ayants
ayez
ayons
bah
banco
be
beaucoup
ben
bien
bigre
boum
bravo
brrr
but
by
c
car
ce
ceci
cela
celà
celle
celle-ci
celle-là
celles
celles-ci
celles-là
celui
celui-ci
celui-là
cent
cents
cependant
certain
certaine
certainement
certaines
certains
certes
ces
c'est
cet
cette
ceux
ceux-ci
ceux-là
cf.
cgr
chacun
chacune
chaque
cher
chère
chères
chers
chez
chiche
chut
cinq
cinquantaine
cinquante
cinquante-cinq
cinquante-deux
cinquante et un
cinquante-et-un
cinquante et une
cinquante-huit
cinquante-neuf
cinquante-quatre
cinquante-sept
cinquante-six
cinquante-trois
cinquantième
cinquième
clac
clic
cm²
combien
comme
comment
compris
concernant
contre
couic
crac
d
dans
d'après
d'autres
de
debout
dec
décembre
dedans
dehors
déjà
delà
de ladite
de la part d'
de la part de
de la part des
de la part du
de laquelle
de même qu'
de même que
d'entre
de peur qu'
de peur que
depuis
derrière
des
dès
desdites
desdits
dès lors qu'
dès lors que
désormais
dès qu'
dès que
desquelles
desquels
dessous
dessus
d'être
deux
deuxième
deuxièmement
devant
devers
devra
différent
différente
différentes
différents
dimanche
dire
divers
diverse
diverses
dix
dix-huit
dixième
dix-neuf
dix-sept
doit
doivent
donc
dont
douze
douzième
dring
du
dudit
d'un
d'une
duquel
durant
effet
elle
elle-même
elles
elles-mêmes
en
encore
en cours d'
en cours de
en deçà
en-dehors d'
en-dehors de
en-dehors des
en-dehors du
en dépit d'
en dépit de
en dépit des
en dépit du
en faveur d'
en faveur de
en faveur des
en faveur du
en marge d'
en marge de
en marge des
en marge du
en matière d'
en matière de
en raison d'
en raison de
en raison des
en raison du
entre
envers
en vertu d'
en vertu de
en vertu du
environ
en vue d'
en vue de
en vue des
en vue du
es
est
et
étaient
étais
était
etant
étant
étante
étantes
étants
etc
été
étée
étées
êtes
étés
étiez
étions
et/ou
etre
être
eu
eue
eu égard à
eu égard au
eu égard aux
eues
euh
eûmes
eurent
eus
eusse
eussent
eusses
eussiez
eussions
eut
eût
eûtes
eux
eux-mêmes
excepté
façon
fais
faisaient
faisant
fait
faux
feront
fev
fevrier
février
flac
floc
font
for
fors
fûmes
furent
fus
fusse
fussent
fusses
fussiez
fussions
fut
fût
fûtes
gens
ghz
grâce à
grâce au
grâce aux
han
hein
hélas
hem
hep
heu
hm³
holà
hop
hormis
hors
hou
houp
hue
hui
huit
huitième
hum
hurrah
ici
if
il
ils
importe
in
into
is
it
j
j'ai
jamais
jan
janvier
je
j'en
jeudi
j'eux
juil
juillet
juin
jusqu
jusqu'
jusqu'a
jusqu'au
jusqu'aux
jusque
juste
km²
l
la
la leur
la mienne
la nôtre
laquelle
las
la sienne
la tienne
l'autre
la vôtre
le
le leur
le mien
le nôtre
lequel
les
les autres
le sien
les leurs
les miennes
les miens
les nôtres
lesquelles
lesquels
les siennes
les siens
les tiennes
les tiens
les vôtres
le tien
leur
leurs
le vôtre
lez
l'on
longtemps
lors d'
lors de
lors des
lors du
lors même qu'
lors même que
lorsqu'
lorsque
lui
lui-même
l'un
lundi
l'une
m
ma
m'a
mai
maint
mainte
maintes
maints
mais
malgré
mar
mardi
mars
me
même
mêmes
m'en
merci
mercredi
mes
mgr
mhz
mien
mienne
miennes
miens
mil
mille
milliards
millions
mince
mm²
moi
moi-même
moins
mois
mon
moyennant
n
n'a
ne
néanmoins
n'est
neuf
neuvième
no
nombreuses
nombreux
non
nonante
nonobstant
nos
not
notre
nôtres
nous
nous-mêmes
nov
novembre
nul
nulle
oct
octante
octobre
of
ohé
olé
ollé
on
ont
onze
onzième
or
ore
ou
ouf
ouias
oust
ouste
outre
paf
pan
par
parbleu
parce qu'
parce que
par-delà
parfois
parmi
par rapport à
par rapport au
par rapport aux
partant
particulier
particulière
particulièrement
pas
pas grand-chose
passé
pendant
personne
peu
peut
peuvent
peux
pff
pfft
pfut
pif
plein
plouf
plus
plus d'un
plus d'une
plusieurs
plutôt
pouah
pour
pour qu'
pour que
pourquoi
pourtant
pourvu qu'
pourvu que
premier
première
premièrement
près
proche
psitt
puisqu'
puisque
qu
qu'
qu'à
quand
quant
quanta
quant à
quant-à-soi
quant au
quant aux
quarante
quarante-cinq
quarante-deux
quarante et un
quarante-et-un
quarante et une
quarante-huit
quarante-neuf
quarante-quatre
quarante-sept
quarante-six
quarante-trois
quatorze
quatre
quatre-vingt
quatre-vingt-cinq
quatre-vingt-deux
quatre-vingt-dix
quatre-vingt-dix-huit
quatre-vingt-dix-neuf
quatre-vingt-dix-sept
quatre-vingt-douze
quatre-vingt-huit
quatre-vingt-neuf
quatre-vingt-onze
quatre-vingt-quatorze
quatre-vingt-quatre
quatre-vingt-quinze
quatre-vingts
quatre-vingt-seize
quatre-vingt-sept
quatre-vingt-six
quatre-vingt-treize
quatre-vingt-trois
quatre-vingt-un
quatre-vingt-une
quatrième
quatrièmement
que
quel
quelconque
quelle
qu'elle
quelles
qu'elles
quelqu'
quelque
quelque chose
quelques
quelques-unes
quelques-uns
quelqu'un
quelqu'une
quels
qui
quiconque
qu'il
qu'ils
quinze
quoi
quoiqu'
quoique
qu'on
revoici
revoilà
rien
risque
s
sa
sacrebleu
s'agit
samedi
sans
sapristi
sauf
se
seize
selon
semaine
sep
sept
septante
septembre
septième
sera
serai
seraient
serais
serait
seras
serez
seriez
serions
serons
seront
ses
s'est
sien
sienne
siennes
siens
s'il
s'ils
sinon
six
sixième
soi
soient
soi-même
sois
soit
soixante
soixante-cinq
soixante-deux
soixante-dix
soixante-dix-huit
soixante-dix-neuf
soixante-dix-sept
soixante-douze
soixante et onze
soixante-et-onze
soixante et un
soixante-et-un
soixante-et-une
soixante-huit
soixante-neuf
soixante-quatorze
soixante-quatre
soixante-quinze
soixante-seize
soixante-sept
soixante-six
soixante-treize
soixante-trois
sommes
son
sont
sous
souvent
soyez
soyons
stop
such
suis
suite à
suivant
sur
surtout
sus
t
ta
tac
tacatac
tandis qu'
tandis que
tant
te
tel
telle
tellement
telles
tels
tenant
tes
that
the
their
then
there
these
they
this
tic
tien
tienne
tiennes
tiens
to
toc
toi
toi-même
ton
touchant
toujours
tous
tout
toute
toutefois
toutes
treize
trente
trente-cinq
trente-deux
trente et un
trente-et-un
trente et une
trente-huit
trente-neuf
trente-quatre
trente-sept
trente-six
trente-trois
très
trois
troisième
troisièmement
trop
tsoin
tsouin
tu
un
une
unes
uns
usd
vais
vas
vendredi
vers
via
vif
vifs
vingt
vingt-cinq
vingt-deux
vingt et un
vingt et une
vingt-huit
vingt-neuf
vingt-quatre
vingt-sept
vingt-six
vingt-trois
vis-à-vis
vivat
vive
vives
vlan
voici
voilà
vont
vos
votre
vôtres
vous
vous-mêmes
vrai
was
will
with
y
zéro
zut
ngram/group.py  View file @ c4f45aea

@@ -7,6 +7,7 @@ from gargantext_web.db import NodeNgram,NodeNodeNgram
 from gargantext_web.db import *
 from gargantext_web.db import get_or_create_node
+from analysis.lists import Translations, UnweightedList
 from parsing.corpustools import *

 import sqlalchemy as sa

@@ -21,62 +22,7 @@ from collections import defaultdict
 from math import log
 from functools import reduce

-def queryNodeNodeNgram(nodeMeasure_id=None, corpus_id=None, limit=None):
-    '''
-    queryNodeNodeNgram :: Int -> Int -> Int -> (Int, String, Float)
-    Get list of ngrams according to a measure related to the corpus: maybe tfidf
-    cvalue.
-    '''
-    query = (session.query(Ngram.id, Ngram.terms, NodeNodeNgram.score)
-                    .join(NodeNodeNgram, NodeNodeNgram.ngram_id == Ngram.id)
-                    .join(Node, Node.id == NodeNodeNgram.nodex_id)
-                    .filter(NodeNodeNgram.nodex_id == nodeMeasure_id)
-                    .filter(NodeNodeNgram.nodey_id == corpus_id)
-                    .group_by(Ngram.id, Ngram.terms, NodeNodeNgram.score)
-                    .order_by(desc(NodeNodeNgram.score))
-            )
-
-    if limit is None:
-        query = query.count()
-    elif limit == 0:
-        query = query.all()
-    else:
-        query = query.limit(limit)
-
-    return(query)
-
-def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
-    '''
-    getNgrams :: Corpus -> [(Int, String)] -> [(Int, String)]
-    For a corpus, gives list of highest Cvalue ngrams and highest TFIDF (global)
-    ngrams that have to be grouped with
-    '''
-    #tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
-    cvalue_node = get_or_create_node(nodetype='Cvalue', corpus=corpus)
-    spec_node = get_or_create_node(nodetype='Specificity', corpus=corpus)
-
-    #tfidf_ngrams = queryNodeNodeNgram(nodeMeasure_id=tfidf_node.id, corpus_id=corpus.id)
-    cvalue_ngrams = queryNodeNodeNgram(nodeMeasure_id=cvalue_node.id, corpus_id=corpus.id, limit=limit_sup)
-    spec_ngrams = queryNodeNodeNgram(nodeMeasure_id=spec_node.id, corpus_id=corpus.id, limit=limit_inf)
-
-    #print([n for n in tfidf_ngrams])
-
-    def list2set(_list):
-        _set = set()
-        for n in _list:
-            _set.add((n[0], n[1]))
-        return(_set)
-
-    cvalue_set = set()
-    spec_set = set()
-
-    cvalue_set = list2set(cvalue_ngrams)
-    spec_set = list2set(spec_ngrams)
-
-    cvalue_setDiff = cvalue_set.difference(spec_set)
-
-    return(spec_set, cvalue_setDiff)
-
 def getStemmer(corpus):
     '''

@@ -121,10 +67,16 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
     miam_to_insert = set()
     miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
+    stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
+    #stop_list = UnweightedList(stop_node.id)
+    Stop = aliased(NodeNgram)

     frequency = sa.func.count(NodeNgram.weight)
     ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
                 .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                 .join(Node, Node.id == NodeNgram.node_id)
+                #.outerjoin(Stop, Stop.ngram_id == Ngram.id)
+                #.filter(Stop.node_id == stop_node.id, Stop.ngram_id == None)
                 .filter(Node.parent_id == corpus.id, Node.type_id == cache.NodeType['Document'].id)
                 .group_by(Ngram.id)
                 .order_by(desc(frequency))

@@ -132,6 +84,18 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
                 .limit(limit_sup)
             )

+    stops = (session.query(Ngram.id, Ngram.terms, frequency)
+                .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+                .join(Node, Node.id == NodeNgram.node_id)
+                .join(Stop, Stop.ngram_id == Ngram.id)
+                .filter(Stop.node_id == stop_node.id)
+                .filter(Node.parent_id == corpus.id, Node.type_id == cache.NodeType['Document'].id)
+                .group_by(Ngram.id)
+                .all()
+            )
+    ngrams = [n for n in ngrams if n not in stops]
+    print(ngrams)
+
     #group = defaultdict(lambda : defaultdict())
     ids_dict = dict()
     mainform_dict = dict()
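One remark on the new filtering in compute_groups: `ngrams = [n for n in ngrams if n not in stops]` compares whole (id, terms, frequency) rows with a linear scan of `stops`. A minimal sketch of the same step on made-up rows, matching on ngram_id with a set for cheaper lookups:

    # Toy rows; real ones are (ngram_id, terms, frequency) from SQLAlchemy.
    ngrams = [(1, 'cell', 42), (2, 'the', 40), (3, 'protein', 12)]
    stops  = [(2, 'the', 40)]

    # Same spirit as the diff, but matching on ngram_id only and using a set
    # for O(1) membership tests instead of a list scan.
    stop_ids = {s[0] for s in stops}
    ngrams = [n for n in ngrams if n[0] not in stop_ids]
    print(ngrams)  # [(1, 'cell', 42), (3, 'protein', 12)]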
ngram/miam.py → ngram/mapList.py  View file @ c4f45aea

 # Without this, we couldn't use the Django environment
-#from admin.env import *
+from admin.env import *
 #from ngram.stemLem import *
 from admin.utils import PrintException, DebugTime

@@ -15,42 +15,51 @@ from sqlalchemy.orm import aliased
 from ngram.tools import insert_ngrams
 import csv

-def compute_miam(corpus, limit=500):
+def compute_mapList(corpus, limit=500):
     '''
     According to Specificities and stoplist,
     '''
     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

-    node_group = get_or_create_node(nodetype='Group', corpus=corpus)
+    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
     node_stop = get_or_create_node(nodetype='StopList', corpus=corpus)
+    node_group = get_or_create_node(nodetype='Group', corpus=corpus)
     node_spec = get_or_create_node(nodetype='Specificity', corpus=corpus)

+    Miam = aliased(NodeNgram)
     Stop = aliased(NodeNgram)
     Group = aliased(NodeNgramNgram)
     Spec = aliased(NodeNodeNgram)

-    top_miam = (session.query(Spec.ngram_id, Spec.score)
-                    .outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
-                    .outerjoin(Stop, Stop.ngram_id == Spec.ngram_id)
-                    .filter(Group.node_id == node_group.id)
-                    .filter(Stop.node_id == node_stop.id)
+    top_ngrams = (session.query(Spec.ngram_id, Spec.score)
+                    .join(Miam, Spec.ngram_id == Miam.ngram_id)
+                    #.outerjoin(Group, Group.ngramy_id == Spec.ngram_id)
+                    #.outerjoin(Stop, Stop.ngram_id == Spec.ngram_id)
+                    .filter(Miam.node_id == node_miam.id)
+                    #.filter(Group.node_id == node_group.id)
+                    #.filter(Stop.node_id == node_stop.id)
+                    .filter(Spec.nodex_id == node_spec.id)
                     .order_by(desc(Spec.score))
                     .limit(limit)
                 )

-    print([t for t in top_miam])
-    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
-    session.query(NodeNgram).filter(NodeNgram.node_id == node_miam.id).delete()
+    #print([t for t in top_ngrams])
+    node_mapList = get_or_create_node(nodetype='MapList', corpus=corpus)
+    session.query(NodeNgram).filter(NodeNgram.node_id == node_mapList.id).delete()
     session.commit()

     data = zip(
-        [node_miam.id for i in range(1, limit)]
-        , [n[0] for n in top_miam]
+        [node_mapList.id for i in range(1, limit)]
+        , [n[0] for n in top_ngrams]
         , [1 for i in range(1, limit)]
     )
-    print([d for d in data])
+    # print([d for d in data])
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

-    dbg.show('Miam computed')
+    dbg.show('MapList computed')

 def insert_miam(corpus, ngrams=None, path_file_csv=None):
     dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

@@ -87,8 +96,41 @@ def insert_miam(corpus, ngrams=None, path_file_csv=None):
     file_csv.close()
     dbg.show('Miam computed')

-#corpus = session.query(Node).filter(Node.id==556113).first()
+#corpus = session.query(Node).filter(Node.id==540420).first()
+#compute_mapList(corpus)
 #insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")

+#def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
+#    '''
+#    getNgrams :: Corpus -> [(Int, String)] -> [(Int, String)]
+#    For a corpus, gives list of highest Cvalue ngrams and highest TFIDF (global)
+#    ngrams that have to be grouped with
+#    '''
+#    #tfidf_node = get_or_create_node(nodetype='Tfidf (global)', corpus=corpus)
+#    cvalue_node = get_or_create_node(nodetype='Cvalue', corpus=corpus)
+#    spec_node = get_or_create_node(nodetype='Specificity', corpus=corpus)
+#
+#    #tfidf_ngrams = queryNodeNodeNgram(nodeMeasure_id=tfidf_node.id, corpus_id=corpus.id)
+#    cvalue_ngrams = queryNodeNodeNgram(nodeMeasure_id=cvalue_node.id, corpus_id=corpus.id, limit=limit_sup)
+#    spec_ngrams = queryNodeNodeNgram(nodeMeasure_id=spec_node.id, corpus_id=corpus.id, limit=limit_inf)
+#
+#    #print([n for n in tfidf_ngrams])
+#
+#    def list2set(_list):
+#        _set = set()
+#        for n in _list:
+#            _set.add((n[0],n[1]))
+#        return(_set)
+#
+#    cvalue_set = set()
+#    spec_set = set()
+#
+#    cvalue_set = list2set(cvalue_ngrams)
+#    spec_set = list2set(spec_ngrams)
+#
+#    cvalue_setDiff = cvalue_set.difference(spec_set)
+#
+#    return(spec_set,cvalue_setDiff)
+#
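The `data = zip(...)` pattern above deserves a second look: in Python 3, zip is a lazy iterator that stops at its shortest input, which is presumably why the debugging `print([d for d in data])` is now commented out — iterating it would exhaust the rows before bulk_insert sees any. A minimal sketch with made-up ids:

    limit = 5
    top_ngrams = [(11, 0.9), (12, 0.7), (13, 0.4)]  # (ngram_id, score) rows

    data = zip(
        [99 for i in range(1, limit)]       # node_id column (99 is a toy id)
        , [n[0] for n in top_ngrams]        # ngram_id column
        , [1 for i in range(1, limit)]      # weight column
    )
    # zip stops at the shortest column, so only 3 rows come out:
    print(list(data))  # [(99, 11, 1), (99, 12, 1), (99, 13, 1)]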
ngram/stop.py  View file @ c4f45aea

 # Without this, we couldn't use the Django environment
 #from admin.env import *
 #from ngram.stemLem import *

 import re
 from admin.utils import PrintException

-from gargantext_web.db import NodeNgram, NodeNodeNgram
-from gargantext_web.db import cache, session, get_or_create_node
+from gargantext_web.db import Node, Ngram, NodeNgram, NodeNodeNgram
+from gargantext_web.db import cache, session, get_or_create_node, bulk_insert
+
+import sqlalchemy as sa
 from sqlalchemy.sql import func
 from sqlalchemy import desc, asc, or_, and_, Date, cast, select
 from sqlalchemy import literal_column

@@ -38,7 +35,6 @@ def importStopList(node,filename,language='fr'):
     bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])

 def isStopWord(ngram, stop_words=None):
     '''
     ngram :: (Int, String) => (ngram_id, ngram_terms)

@@ -55,8 +51,9 @@ def isStopWord(ngram, stop_words=None):
     if format_regex.match(word):
         return(True)

-    for regex in ["(.*)\d(.*)"
-                 , "^.{1,2}$"
+    for regex in ["^.{1,2}$"
+                 , "(.*)\d(.*)"
                  , "(.*)(\.)(.*)"
                  , "(.*)(\,)(.*)"
                  , "(.*)(study)(.*)"

@@ -73,13 +70,11 @@ def isStopWord(ngram, stop_words=None):
     if test_match(word, regex) is True:
         return(True)

-def compute_stop(corpus, limit=2000, debug=False):
+def compute_stop(corpus, size=2000, debug=False):
     '''
     do some statitics on all stop lists of database of the same type
     '''
     stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
-    miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
     # TODO do a function to get all stop words with social scores
     root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()

@@ -91,33 +86,25 @@ def compute_stop(corpus,size=2000,debug=False):
         .all()
     )

-    top_words = (session.query(Ngram.id, Ngram.terms)
-                    .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
-                    .filter(NodeNgram.node_id == miam_node.id)
-                    .order_by(desc(NodeNgram.weight))
-                    .limit(size)
-                )
-    ngrams_to_stop = filter(lambda x: isStopWord(x, stop_words=stop_words), top_words)
-    stop = WeightedList({n[0]: -1 for n in ngrams_to_stop})
-    stop.save(stop_node.id)
-    miam = UnweightedList(miam_node.id)
-    new_miam = miam - stop
-    new_miam.save(miam_node.id)
-    # data = zip(
-    #     [stop_node.id for i in range(0,size)]
-    #     , [ngram[0] for ngram in ngrams_to_stop]
-    #     , [-1 for i in range(0,size)]
-    #     )
-    # bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
-#corpus=session.query(Node).filter(Node.id==545461).first()
-#compute_stop(corpus)
+    #print([n for n in stop_words])
+    frequency = sa.func.count(NodeNgram.weight)
+    ngrams = (session.query(Ngram.id, Ngram.terms, frequency)
+                .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+                .join(Node, Node.id == NodeNgram.node_id)
+                .filter(Node.parent_id == corpus.id,
+                        Node.type_id == cache.NodeType['Document'].id)
+                .group_by(Ngram.id)
+                .order_by(desc(frequency))
+                .all()
+                #.limit(limit)
+            )
+    ngrams_to_stop = filter(lambda x: isStopWord(x, stop_words=stop_words), ngrams)
+    #print([n for n in ngrams_to_stop])
+    stop = WeightedList({n[0]: -1 for n in ngrams_to_stop})
+    stop.save(stop_node.id)
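The reordered regex list in isStopWord screens terms before the statistics run. A minimal standalone sketch of that screen (the sample terms are illustrative):

    import re

    regexes = [ r"^.{1,2}$"        # one- or two-character tokens
              , r"(.*)\d(.*)"      # anything containing a digit
              , r"(.*)(\.)(.*)"    # anything containing a dot
              , r"(.*)(\,)(.*)"    # anything containing a comma
              , r"(.*)(study)(.*)" # boilerplate like "case study"
              ]

    def looks_like_stopword(term):
        return any(re.match(regex, term) for regex in regexes)

    for term in ["of", "h2o", "fig. 3", "case study", "cooccurrence"]:
        print(term, looks_like_stopword(term))
    # only "cooccurrence" passes the screen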
ngram/tools.py  View file @ c4f45aea

@@ -109,3 +109,30 @@ def insert_nodengramngram(nodengramngram):
     ''' % (NodeNgramNgram.__table__.name,))
     db.commit()

+#def queryNodeNodeNgram(nodeMeasure_id=None, corpus_id=None, limit=None):
+#    '''
+#    queryNodeNodeNgram :: Int -> Int -> Int -> (Int, String, Float)
+#    Get list of ngrams according to a measure related to the corpus: maybe tfidf
+#    cvalue.
+#    '''
+#    query = (session.query(Ngram.id, Ngram.terms, NodeNodeNgram.score)
+#                    .join(NodeNodeNgram, NodeNodeNgram.ngram_id == Ngram.id)
+#                    .join(Node, Node.id == NodeNodeNgram.nodex_id)
+#                    .filter(NodeNodeNgram.nodex_id == nodeMeasure_id)
+#                    .filter(NodeNodeNgram.nodey_id == corpus_id)
+#                    .group_by(Ngram.id, Ngram.terms, NodeNodeNgram.score)
+#                    .order_by(desc(NodeNodeNgram.score))
+#            )
+#
+#    if limit is None:
+#        query = query.count()
+#    elif limit == 0 :
+#        query = query.all()
+#    else:
+#        query = query.limit(limit)
+#
+#    return(query)
+#
ngram/workflow.py  View file @ c4f45aea

@@ -4,8 +4,9 @@ from ngram.cvalue import compute_cvalue
 from ngram.specificity import compute_specificity
 #from ngram.stop import compute_stop
 from ngram.group import compute_groups
-from ngram.miam import compute_miam
 from gargantext_web.db import get_or_create_node
+from ngram.mapList import compute_mapList

 #from gargantext_web.celery import update_processing

@@ -13,31 +14,32 @@ def ngram_workflow(corpus, n=5000):
     '''
     All the workflow to filter the ngrams.
     '''
-    compute_tfidf_global(corpus)
+    # compute_tfidf_global(corpus)

-    part = round(n * 0.8)
+    part = round(n * 0.9)

-    compute_cvalue(corpus, limit=part) # size
+    # compute_cvalue(corpus,limit=part) # size

-    part = round(part * 0.4)
+    part = round(part * 0.8)

     print('spec part:', part)
-    compute_specificity(corpus, limit=part)
+    # compute_specificity(corpus,limit=part)

-    part = round(part * 0.5)
+    part = round(part * 0.8)

     # compute_stop(corpus)

     limit_inf = round(part * 1)
     limit_sup = round(part * 5)
     print(limit_inf, limit_sup)
-    compute_groups(corpus, limit_inf=limit_inf, limit_sup=limit_sup)
+    # compute_groups(corpus,limit_inf=limit_inf, limit_sup=limit_sup)

-    # compute_miam(corpus,limit=part) # size
+    compute_mapList(corpus, limit=part) # size

-    compute_tfidf(corpus)
+    # compute_tfidf(corpus)

-#corpus=session.query(Node).filter(Node.id==257579).first()
+#corpus=session.query(Node).filter(Node.id==540420).first()
+#corpus=session.query(Node).filter(Node.id==559637).first()

 #ngram_workflow(corpus)
 #update_processing(corpus, 0)
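For the default n=5000, the retuned multipliers shift the list sizes noticeably; a quick trace of the new cascade (old values in comments):

    n = 5000
    part = round(n * 0.9)        # 4500  (previously round(n * 0.8)    = 4000)
    part = round(part * 0.8)     # 3600  (previously round(part * 0.4) = 1600)
    part = round(part * 0.8)     # 2880  (previously round(part * 0.5) =  800)

    limit_inf = round(part * 1)  # 2880
    limit_sup = round(part * 5)  # 14400
    print(limit_inf, limit_sup)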