humanities / gargantext

Commit 1160395d authored Oct 22, 2015 by PkSM3

    [UPDATE] merge conflict resolved ?

Parents: 034dc5e2, a57a4185

Showing 13 changed files with 202 additions and 53 deletions (+202 -53)
analysis/cooccurrences.py   +3   -3
analysis/functions.py       +3   -4
annotations/urls.py         +1   -1
annotations/views.py        +2   -0
gargantext_web/celery.py    +1   -1
gargantext_web/views.py     +11  -3
init.py                     +15  -0
ngram/group.py              +5   -3
ngram/miam.py               +40  -0
ngram/stop.py               +94  -21
ngram/tools.py              +0   -3
parsing/corpustools.py      +22  -12
rest_v1_0/graph.py          +5   -2
analysis/cooccurrences.py  (+3 -3)

@@ -17,7 +17,8 @@ def do_cooc(corpus=None
                 , n_min=2, n_max=None
                 , start=None, end=None
                 , limit=1000
-                , isMonopartite=True):
+                , isMonopartite=True
+                , apax=2):
     '''
     Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
     For the moment list of paramters are not supported because, lists need to
@@ -149,8 +150,7 @@ def do_cooc(corpus=None
     # Cooc is symetric, take only the main cooccurrences and cut at the limit
     cooc_query = cooc_query.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)
-    cooc_query = cooc_query.having(cooc_score > 1)
+    cooc_query = cooc_query.having(cooc_score > apax) #.having(cooc_score > 1)
     if isMonopartite:
         cooc_query = cooc_query.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
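A standalone sketch of what the new apax parameter does (plain dicts here, not the project's SQLAlchemy query): it replaces the hard-coded `> 1` threshold, so pairs whose cooccurrence score is not strictly greater than apax are dropped before the graph is built.

    def filter_cooc(cooc_scores, apax=2):
        # Keep only the upper triangle (x < y) of the symmetric matrix and
        # drop pairs whose score is not strictly greater than apax.
        return {(x, y): score
                for (x, y), score in cooc_scores.items()
                if x < y and score > apax}

    scores = {(1, 2): 5, (2, 1): 5, (1, 3): 1, (3, 4): 2}
    print(filter_cooc(scores, apax=2))   # {(1, 2): 5}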
analysis/functions.py  (+3 -4)

@@ -30,7 +30,7 @@ def diag_null(x):
     return x - x * scipy.eye(x.shape[0])
 
-def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True):
+def do_distance(cooc_id, field1=None, field2=None, isMonopartite=True, apax=2):
     '''
     do_distance :: Int -> (Graph, Partition, {ids}, {weight})
     '''
@@ -119,6 +119,7 @@ def get_cooc(request=None, corpus=None
             , field1='ngrams', field2='ngrams'
             , cooc_id=None, type='node_link', size=1000
             , start=None, end=None
+            , apax=2
             ):
     '''
     get_ccoc : to compute the graph.
@@ -141,9 +142,7 @@ def get_cooc(request=None, corpus=None
     #cooc_id = get_or_create_node(nodetype='Cooccurrence', corpus=corpus).id
     cooc_id = do_cooc(corpus=corpus, field1="ngrams", field2="ngrams"
                       , miam_id=miam_id, group_id=group_id, stop_id=stop_id, limit=size
                       , isMonopartite=isMonopartite
-                      , start=start, end=end)
+                      , start=start, end=end, apax=apax)
     G, partition, ids, weight = do_distance(cooc_id, field1="ngrams", field2="ngrams", isMonopartite=isMonopartite)
annotations/urls.py  (+1 -1)

@@ -5,6 +5,6 @@ from annotations import views
 urlpatterns = patterns('',
     url(r'^document/(?P<doc_id>[0-9]+)$', views.Document.as_view()), # document view
     url(r'^corpus/(?P<corpus_id>[0-9]+)/document/(?P<doc_id>[0-9]+)$', views.NgramList.as_view()), # the list associated with an ngram
-    url(r'^lists/(?P<list_id>[0-9]+)/ngrams/(?P<ngram_ids>[0-9]+\+*[0-9]*)$', views.NgramEdit.as_view()),
+    url(r'^lists/(?P<list_id>[0-9]+)/ngrams/(?P<ngram_ids>[0-9,\+]+)+$', views.NgramEdit.as_view()),
     url(r'^lists/(?P<list_id>[0-9]+)/ngrams/create$', views.NgramCreate.as_view()), #
 )
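For reference, a standalone comparison of the old and new ngram_ids patterns with Python's re module (the sample paths are made up): the old pattern only accepted a single '+'-separated pair of ids, the new one accepts any run of digits, commas and plus signs.

    import re

    old = re.compile(r'^lists/(?P<list_id>[0-9]+)/ngrams/(?P<ngram_ids>[0-9]+\+*[0-9]*)$')
    new = re.compile(r'^lists/(?P<list_id>[0-9]+)/ngrams/(?P<ngram_ids>[0-9,\+]+)+$')

    for path in ('lists/7/ngrams/123',       # one id: both match
                 'lists/7/ngrams/123+456',   # two ids: both match
                 'lists/7/ngrams/1+2+3'):    # three ids: only the new pattern matches
        print(path, bool(old.match(path)), bool(new.match(path)))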
annotations/views.py  (+2 -0)

@@ -93,7 +93,9 @@ class NgramEdit(APIView):
         """
         Delete a ngram from a list
         """
+        print(ngram_ids)
         for ngram_id in ngram_ids.split('+'):
+            print('ngram_id', ngram_id)
             ngram_id = int(ngram_id)
             (session.query(NodeNgram)
                 .filter(NodeNgram.node_id==list_id)
gargantext_web/celery.py  (+1 -1)

@@ -42,7 +42,7 @@ def apply_workflow(corpus_id):
     parse_resources(corpus)
     update_processing(corpus, 2)
 
-    extract_ngrams(corpus, ['title', 'abstract'])
+    extract_ngrams(corpus, ['title', 'abstract'], nlp=True)
     update_processing(corpus, 3)
 
     ngram_workflow(corpus)
gargantext_web/views.py  (+11 -3)

@@ -699,9 +699,17 @@ def sankey_csv(request, corpus_id):
     corpus = session.query(Node).filter(Node.id==corpus_id).first()
     data = [
         ["source", "target", "value"]
         , ["Elvis_1", "Elvis_2", 1]
         , ["Comment_1", "Theme_1", 1]
         , ["Elvis_2", "Elvis_3", 2]
         , ["Barry", "Elvis_3", 2]
         , ["Comment_2", "Theme_2", 2]
         , ["Comment_3", "Theme_2", 2]
         , ["Comment_7", "Theme_1", 2]
         , ["Comment_8", "Theme_3", 2]
         , ["Theme_1", "Reco_par_1", 2]
         , ["Theme_2", "Reco_par_2", 2]
         , ["Theme_2", "Reco_par_5", 2]
         , ["Theme_3", "Reco_par_5", 1]
         ]
     return(CsvHttpResponse(data))
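A minimal sketch of how such a list of rows serializes to CSV, using the standard csv module rather than the project's CsvHttpResponse (only a few of the rows above are repeated here):

    import csv, io

    data = [["source", "target", "value"],
            ["Elvis_1", "Elvis_2", 1],
            ["Elvis_2", "Elvis_3", 2]]

    buf = io.StringIO()
    csv.writer(buf).writerows(data)
    print(buf.getvalue())
    # source,target,value
    # Elvis_1,Elvis_2,1
    # Elvis_2,Elvis_3,2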
init.py  (+15 -0)

@@ -151,4 +151,19 @@ session.commit()
 ###f.close()
 ##
 ##
+
+from ngram.stop import importStopList
+root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
+importStopList(root, '/srv/gargantext/init/stop_lists/fr.txt', 'fr')
+importStopList(root, '/srv/gargantext/init/stop_lists/en.txt', 'en')
+
+root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
+#importStopList(root, '/srv/gargantext/init/stop_lists/fr.txt', 'fr')
+importStopList(root, '/srv/gargantext/init/stop_lists/en.txt', 'en')
+
 #exit()
ngram/group.py  (+5 -3)

@@ -60,15 +60,17 @@ def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
     #print([n for n in tfidf_ngrams])
-    def list2set(_list, _set):
+    def list2set(_list):
+        _set = set()
         for n in _list:
             _set.add((n[0], n[1]))
+        return(_set)
 
     cvalue_set = set()
     spec_set = set()
 
-    list2set(cvalue_ngrams, cvalue_set)
-    list2set(spec_ngrams, spec_set)
+    cvalue_set = list2set(cvalue_ngrams)
+    spec_set = list2set(spec_ngrams)
 
     cvalue_setDiff = cvalue_set.difference(spec_set)
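A standalone sketch of the refactor above (the ngram rows are invented): list2set no longer mutates a set supplied by the caller, it builds and returns a new one, so the call sites become plain assignments.

    def list2set(_list):
        # Collect (id, terms) pairs from rows shaped like (id, terms, score, ...).
        _set = set()
        for n in _list:
            _set.add((n[0], n[1]))
        return _set

    cvalue_ngrams = [(1, 'cell cycle', 12.0), (2, 'protein', 8.5)]
    spec_ngrams = [(2, 'protein', 0.7)]

    cvalue_set = list2set(cvalue_ngrams)
    spec_set = list2set(spec_ngrams)
    print(cvalue_set.difference(spec_set))   # {(1, 'cell cycle')}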
ngram/miam.py  (+40 -0)

@@ -12,6 +12,8 @@ from sqlalchemy import desc, asc, or_, and_, Date, cast, select
 from sqlalchemy import literal_column
 from sqlalchemy.orm import aliased
+from ngram.tools import insert_ngrams
+import csv
 
 def compute_miam(corpus,limit=500):
     '''
@@ -50,5 +52,43 @@ def compute_miam(corpus,limit=500):
     dbg.show('Miam computed')
 
+def insert_miam(corpus, ngrams=None, path_file_csv=None):
+    dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
+    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
+    session.query(NodeNgram).filter(NodeNgram.node_id==node_miam.id).delete()
+    session.commit()
+
+    stop_words = set()
+    miam_words = set()
+
+    if path_file_csv is not None:
+        file_csv = open(path_file_csv, "r")
+        reader = csv.reader(file_csv, delimiter=',')
+        for line in reader:
+            word = line[0]
+            tag = line[4]
+            if tag == '1':
+                miam_words.add((word, 1))
+            elif tag == '0':
+                stop_words.add((word, 1))
+
+    miam_ids = insert_ngrams(miam_words)
+    print(miam_ids)
+
+    limit = len(list(miam_words))
+    data = zip(
+        [node_miam.id for i in range(1, limit)]
+        , [miam_ids[n] for n in miam_ids.keys()]
+        , [1 for i in range(1, limit)]
+        )
+    #print([d for d in data])
+    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
+
+    file_csv.close()
+    dbg.show('Miam computed')
+
+#corpus = session.query(Node).filter(Node.id==556113).first()
+#insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")
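A standalone sketch of the CSV convention insert_miam relies on (the rows and file layout here are illustrative, inferred from line[0] and line[4] above): column 0 holds the term and column 4 a tag, '1' for the miam list and '0' for the stop list.

    import csv, io

    sample = io.StringIO("climate change,x,x,x,1\n"
                         "of the,x,x,x,0\n"
                         "carbon tax,x,x,x,1\n")

    miam_words, stop_words = set(), set()
    for line in csv.reader(sample, delimiter=','):
        word, tag = line[0], line[4]
        if tag == '1':
            miam_words.add((word, 1))
        elif tag == '0':
            stop_words.add((word, 1))

    print(sorted(miam_words))   # [('carbon tax', 1), ('climate change', 1)]
    print(sorted(stop_words))   # [('of the', 1)]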
ngram/stop.py  (+94 -21)

@@ -2,49 +2,122 @@
 #from admin.env import *
 #from ngram.stemLem import *
 
+import re
 from admin.utils import PrintException
 from gargantext_web.db import NodeNgram, NodeNodeNgram
-from gargantext_web.db import get_or_create_node, session
+from gargantext_web.db import cache, session, get_or_create_node
 
 from sqlalchemy.sql import func
 from sqlalchemy import desc, asc, or_, and_, Date, cast, select
 from sqlalchemy import literal_column
 from sqlalchemy.orm import aliased
 
+from ngram.tools import insert_ngrams
+from analysis.lists import WeightedList, UnweightedList
+
+def importStopList(node, filename, language='fr'):
+    with open(filename, "r") as f:
+        stop_list = f.read().splitlines()
+    stop_words = set(stop_list)
+    stop_ids = insert_ngrams([(word, len(word.split(' '))) for word in stop_words])
+
+    stop_node = get_or_create_node(nodetype='StopList', corpus=node)
+    stop_node.language_id = cache.Language[language].id
+    session.add(stop_node)
+    session.commit()
+
+    size = len(list(stop_words))
+    data = zip(
+        [stop_node.id for i in range(0, size)]
+        , [stop_ids[word] for word in list(stop_words)]
+        , [-1 for i in range(0, size)]
+        )
+    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
+
+def isStopWord(ngram, stop_words=None):
+    '''
+    ngram :: (Int, String) => (ngram_id, ngram_terms)
+    stop_words :: Set of String
+    (to avoid SQL query each time isStopWord is invoked, get in as parameter)
+    '''
+    word = ngram[1]
+
+    if word in stop_words:
+        return(True)
+
+    def test_match(word, regex):
+        format_regex = re.compile(regex)
+        if format_regex.match(word):
+            return(True)
+
+    for regex in [
+          "(.*)\d(.*)"
+        , "^.{1,2}$"
+        , "(.*)(\.)(.*)"
+        , "(.*)(\,)(.*)"
+        , "(.*)(study)(.*)"
+        , "(.*)(result)(.*)"
+        , "(.*)(année)(.*)"
+        , "(.*)(temps)(.*)"
+        , "(.*)(%)(.*)"
+        , "(.*)(\{)(.*)"
+        , "(.*)(terme)(.*)"
+        , "(.*)(différent)(.*)"
+        , "(.*)(travers)(.*)"
+        , "(.*)(:|\|)(.*)"
+        ]:
+        if test_match(word, regex) is True:
+            return(True)
+
-def computeStop(corpus, size=100):
+def compute_stop(corpus, size=2000, debug=False):
     '''
     do some statitics on all stop lists of database of the same type
     '''
-    node_stop = get_or_create_node(nodetype='StopList', corpus=corpus)
-    Stop = aliased(NodeNgram)
-    top_spec = (session.query(NodeNodeNgram.ngram_id, NodeNodeNgram.score) # TODO do a function to get all stop words with social scores
-                .outerjoin(Stop, Stop.ngram_id == NodeNodeNgram.ngram_id)
-                .filter(NodeNodeNgram.nodex_id == node_spec.id)
-                .filter(Stop.node_id == node_stop.id)
-                .order_by(desc(NodeNodeNgram.score))
-                .limit(size)
-                )
-    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
-    session.query(NodeNgram).filter(NodeNgram.node_id == node_miam.id).delete()
-    data = zip(
-        [node_miam.id for i in range(1, size)]
-        , [1 for i in range(1, size)]
-        , [n[0] for n in top_spec])
-    #print([d for d in data])
-    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
+    stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
+    miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
+    root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
+    root_stop_id = get_or_create_node(nodetype='StopList', corpus=root).id
+    stop_words = (session.query(Ngram.terms)
+                  .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+                  .filter(NodeNgram.node_id == root_stop_id)
+                  .all()
+                  )
+    top_words = (session.query(Ngram.id, Ngram.terms)
+                 .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+                 .filter(NodeNgram.node_id == miam_node.id)
+                 .order_by(desc(NodeNgram.weight))
+                 .limit(size)
+                 )
+    ngrams_to_stop = filter(lambda x: isStopWord(x, stop_words=stop_words), top_words)
+    stop = WeightedList({n[0]: -1 for n in ngrams_to_stop})
+    stop.save(stop_node.id)
+    miam = UnweightedList(miam_node.id)
+    new_miam = miam - stop
+    new_miam.save(miam_node.id)
+    # data = zip(
+    #     [stop_node.id for i in range(0,size)]
+    #     , [ngram[0] for ngram in ngrams_to_stop]
+    #     , [-1 for i in range(0,size)]
+    #     )
+    # bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
 
-#corpus=session.query(Node).filter(Node.id==244250).first()
-#computeMiam(corpus)
+#corpus=session.query(Node).filter(Node.id==545461).first()
+#compute_stop(corpus)
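A compact, self-contained rewrite of the isStopWord heuristic introduced above (the stop list and ngrams below are made up, and any() replaces the nested test_match loop): an ngram is flagged when its terms sit in the explicit stop list or match one of the junk patterns (digits, very short strings, punctuation, a few hard-coded words).

    import re

    JUNK_PATTERNS = [r"(.*)\d(.*)", r"^.{1,2}$", r"(.*)(\.)(.*)", r"(.*)(\,)(.*)",
                     r"(.*)(study)(.*)", r"(.*)(result)(.*)"]

    def is_stop_word(ngram, stop_words=None):
        # ngram :: (ngram_id, ngram_terms), as in ngram/stop.py
        word = ngram[1]
        if stop_words and word in stop_words:
            return True
        return any(re.compile(regex).match(word) for regex in JUNK_PATTERNS)

    stop_words = {"of the"}
    for ng in [(1, "of the"), (2, "p < 0.05"), (3, "x"), (4, "case study"), (5, "climate change")]:
        print(ng, is_stop_word(ng, stop_words=stop_words))
    # only (5, 'climate change') comes out False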
ngram/tools.py  (+0 -3)

@@ -28,8 +28,6 @@ def insert_ngrams(ngrams,get='terms-id'):
             %s AS ngram
         WHERE
             tmp__ngram.terms = ngram.terms
-            AND
-            tmp__ngram.n = ngram.n
     ''' % (Ngram.__table__.name,))
 
     cursor.execute('''
@@ -67,7 +65,6 @@ def insert_ngrams(ngrams,get='terms-id'):
     db.commit()
     return(ngram_ids)
-
 def insert_nodengramngram(nodengramngram):
     db, cursor = get_cursor()
parsing/corpustools.py  (+22 -12)

@@ -177,6 +177,8 @@ def parse_resources(corpus, user=None, user_id=None):
 # ngrams extraction
 from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
+from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
+
 class NgramsExtractors(defaultdict):
     def __init__(self):
         # English
@@ -201,7 +203,7 @@ class NgramsExtractors(defaultdict):
 ngramsextractors = NgramsExtractors()
 
-def extract_ngrams(corpus, keys):
+def extract_ngrams(corpus, keys, nlp=True):
     dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
     default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
 
     # query the hyperdata associated with the given keys
@@ -220,7 +222,7 @@ def extract_ngrams(corpus, keys):
     ngrams_data = set()
     ngrams_language_data = set()
-    ngrams_tag_data = set()
+    # ngrams_tag_data = set()
     node_ngram_list = defaultdict(lambda: defaultdict(int))
     for nodeinfo in hyperdata_query:
@@ -237,17 +239,25 @@ def extract_ngrams(corpus, keys):
             ngramsextractor = ngramsextractors[language_iso2]
             for text in nodeinfo[2:]:
                 if text is not None and len(text):
-                    ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
+                    if nlp == True:
+                        ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
+                    else:
+                        ngrams = wordpunct_tokenize(text.lower())
                     for ngram in ngrams:
-                        n = len(ngram)
-                        terms = ' '.join([token for token, tag in ngram]).lower()
+                        if nlp == True:
+                            n = len(ngram)
+                            terms = ' '.join([token for token, tag in ngram]).lower()
+                        else:
+                            terms = ngram
+                            n = 1
                         # TODO BUG here
-                        if n == 1:
+                        # if n == 1:
                             #tag_id = cache.Tag[ngram[0][1]].id
-                            tag_id = 1
+                            # tag_id = 1
                             #print('tag_id', tag_id)
-                        elif n > 1:
-                            tag_id = 1
+                        # elif n > 1:
+                            # tag_id = 1
                             #tag_id = cache.Tag[ngram[0][1]].id
                             #tag_id = cache.Tag['NN'].id
                             #tag_id = 14
@@ -255,7 +265,7 @@ def extract_ngrams(corpus, keys):
                         node_ngram_list[node_id][terms] += 1
                         ngrams_data.add((terms[:255], n))
                         ngrams_language_data.add((terms, language_id))
-                        ngrams_tag_data.add((terms, tag_id))
+                        # ngrams_tag_data.add((terms, tag_id))
 
     # insert ngrams to temporary table
     dbg.show('find ids for the %d ngrams' % len(ngrams_data))
@@ -263,12 +273,12 @@ def extract_ngrams(corpus, keys):
     ngram_ids = insert_ngrams(ngrams_data)
 
     dbg.show('insert associations')
-    node_ngram_data = list()
+    node_ngram_data = set()
     for node_id, ngrams in node_ngram_list.items():
         for terms, weight in ngrams.items():
             try:
                 ngram_id = ngram_ids[terms]
-                node_ngram_data.append((node_id, ngram_id, weight, ))
+                node_ngram_data.add((node_id, ngram_id, weight, ))
             except Exception as e:
                 print("err01:", e)
     bulk_insert(Node_Ngram, ['node_id', 'ngram_id', 'weight'], node_ngram_data, cursor=cursor)
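A minimal sketch of the two paths behind the new nlp flag (the tagged ngram is a made-up example of what a POS-tagging extractor yields; the real extractors live in parsing/NgramsExtractors): with nlp=True the (token, tag) pairs are joined into a terms string, with nlp=False the text is simply lowercased and split by wordpunct_tokenize, each token counting as an ngram of length 1.

    from nltk.tokenize import wordpunct_tokenize

    text = "Climate Change mitigation [2015]"

    # nlp=True path: the extractor yields (token, tag) pairs.
    ngram = [("Climate", "NNP"), ("Change", "NNP"), ("mitigation", "NN")]
    n = len(ngram)
    terms = ' '.join([token for token, tag in ngram]).lower()
    print(n, terms)                        # 3 climate change mitigation

    # nlp=False path: plain tokens, each with n = 1.
    for terms in wordpunct_tokenize(text.lower()):
        print(1, terms)                    # 1 climate / 1 change / 1 mitigation / 1 [ / 1 2015 / 1 ]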
rest_v1_0/graph.py  (+5 -2)

@@ -21,19 +21,21 @@ class Graph(APIView):
         format_ = request.GET.get('format', 'json')
         type_ = request.GET.get('type', 'node_link')
+        apax = request.GET.get('apax', 2)
 
         corpus = session.query(Node).filter(Node.id==corpus_id).first()
 
         accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
         accepted_field2 = ['ngrams',]
+        options = ['start', 'end', 'apax']
 
         if field1 in accepted_field1:
             if field2 in accepted_field2:
                 if start is not None and end is not None:
-                    data = get_cooc(corpus=corpus, field1=field1, field2=field2, start=start, end=end)
+                    data = get_cooc(corpus=corpus, field1=field1, field2=field2, start=start, end=end, apax=apax)
                 else:
-                    data = get_cooc(corpus=corpus, field1=field1, field2=field2)
+                    data = get_cooc(corpus=corpus, field1=field1, field2=field2, apax=apax)
         if format_ == 'json':
             return JsonHttpResponse(data)
         else:
@@ -41,4 +43,5 @@ class Graph(APIView):
             'Warning USAGE' : 'One field for each range:'
             , 'field1' : accepted_field1
             , 'field2' : accepted_field2
+            , 'options' : options
             })
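One note, as a hedged sketch rather than the view itself: values read from request.GET arrive as strings, so ?apax=3 yields '3' while the default stays the integer 2; a caller that wants a numeric threshold can coerce it (a plain dict stands in for request.GET here).

    for GET in ({'apax': '3'}, {}):      # with and without ?apax=... in the query string
        apax = int(GET.get('apax', 2))
        print(apax)                      # 3, then 2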