Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
8f980cc5
Commit
8f980cc5
authored
Oct 09, 2014
by
Administrator
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
extract stuff (sql query factor
parent
ed5cf272
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
58 additions
and
40 deletions
+58
-40
extract.py
analysis/extract.py
+58
-40
No files found.
analysis/extract.py
View file @
8f980cc5
...
@@ -15,10 +15,10 @@ from analysis.languages import english_stem
...
@@ -15,10 +15,10 @@ from analysis.languages import english_stem
# from analysis.languages import french_stem as stem
# from analysis.languages import french_stem as stem
# print("Selection langue anglaise")
# print("Selection langue anglaise")
stemmer
=
EnglishStemmer
()
stemmer
=
EnglishStemmer
()
l
=
set
()
l
=
set
()
# du format: terms, stems, count
d
=
defaultdict
(
lambda
:
\
d
=
defaultdict
(
lambda
:
\
defaultdict
(
lambda
:
\
defaultdict
(
lambda
:
\
...
@@ -26,48 +26,93 @@ d = defaultdict( lambda:\
...
@@ -26,48 +26,93 @@ d = defaultdict( lambda:\
defaultdict
(
int
)
\
defaultdict
(
int
)
\
)))
)))
#if isinstance(corpus, Corpus) and field in [ column.name for column in Document._meta.fields]:
# if isinstance(corpus, Corpus) and field in [ column.name for column in Document._meta.fields]:
def
save_newgrams
(
new_grams
):
NgramTemporary
.
objects
.
bulk_create
(
new_grams
)
NgramDocumentTemporary
.
objects
.
bulk_create
(
new_gramDoc
)
cursor
=
connection
.
cursor
()
# LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;
query_string
=
"""
INSERT INTO documents_ngram
SELECT * FROM documents_ngramtemporary WHERE NOT EXISTS
( SELECT 1 FROM documents_ngram WHERE
documents_ngram.terms = documents_ngramtemporary.terms);
delete from documents_ngramtemporary;
INSERT INTO
documents_ngramdocument (terms_id, document_id, occurrences)
SELECT
GT.id, DT.id, NDT.occurrences
FROM
documents_ngramdocumenttemporary as NDT
INNER JOIN documents_document AS DT ON DT.id = NDT.document
INNER JOIN documents_ngram AS GT ON GT.terms = NDT.terms ;
delete from documents_ngramdocumenttemporary;
"""
cursor
.
execute
(
query_string
)
def
words_field
(
corpus
=
None
,
field
=
'abstract'
):
def
words_field
(
corpus
=
None
,
field
=
'abstract'
):
docs
=
Document
.
objects
.
filter
(
corpus
=
corpus
)
docs
=
Document
.
objects
.
filter
(
corpus
=
corpus
)
def
ngrams
(
text
,
grammar_rule
=
'jj_nn'
):
def
fouille
(
text
,
grammar_rule
=
'jj_nn'
):
# TODO : grammar_rule
# TODO : grammar_rule
from
analysis.grammar_rules
import
jj_nn
as
rule
from
analysis.grammar_rules
import
jj_nn
as
rule
grammar
=
nltk
.
RegexpParser
(
rule
)
grammar
=
nltk
.
RegexpParser
(
rule
)
#text = clean(text)
#text = clean(text)
sentances
=
nltk
.
sent_tokenize
(
text
)
sentances
=
nltk
.
sent_tokenize
(
text
)
result
=
[]
result
=
[]
for
sentance
in
sentances
:
for
sentance
in
sentances
:
try
:
try
:
t
=
pos_tag
(
sentance
)
t
=
pos_tag
(
sentance
)
g
=
grammar
.
parse
(
t
)
g
=
grammar
.
parse
(
t
)
x
=
g
.
subtrees
()
x
=
g
.
subtrees
()
while
True
:
while
True
:
try
:
try
:
subtree
=
next
(
x
)
subtree
=
next
(
x
)
if
subtree
.
label
()
==
'NP'
:
if
subtree
.
label
()
==
'NP'
:
#print(subtree.label())
#print(subtree.label())
result
.
append
(
subtree
.
leaves
())
result
.
append
(
subtree
.
leaves
())
except
Exception
as
e
:
except
Exception
as
e
:
break
break
except
Exception
as
e
:
except
Exception
as
e
:
print
(
e
)
print
(
e
)
pass
pass
return
iter
(
result
)
return
iter
(
result
)
def
ograms
(
text
,
field
=
doc
.
abstract
)
try
:
sentences
=
nltk
.
sent_tokenize
(
field
)
words
=
[
nltk
.
wordpunct_tokenize
(
str
(
sentence
))
for
sentence
in
sentences
]
for
word
in
words
[
0
]:
try
:
stems
=
stemmer
.
stem
(
str
(
word
))
new
=
(
word
,
stems
,
len
(
stems
.
split
(
" "
)))
l
.
add
(
new
)
d
[
word
][
doc
.
id
][
'count'
]
=
d
[
word
][
doc
.
pk
]
.
get
(
'count'
,
0
)
+
1
except
Exception
as
e
:
pass
#print(e)
except
Exception
as
e
:
pass
#print(e)
for
doc
in
docs
:
for
doc
in
docs
:
try
:
try
:
sentences
=
nltk
.
sent_tokenize
(
doc
.
abstract
)
sentences
=
nltk
.
sent_tokenize
(
doc
.
abstract
)
words
=
[
nltk
.
wordpunct_tokenize
(
str
(
sentence
))
for
sentence
in
sentences
]
words
=
[
nltk
.
wordpunct_tokenize
(
str
(
sentence
))
for
sentence
in
sentences
]
for
word
in
words
[
0
]:
for
word
in
words
[
0
]:
try
:
try
:
stems
=
stemmer
.
stem
(
str
(
word
))
stems
=
stemmer
.
stem
(
str
(
word
))
...
@@ -79,42 +124,15 @@ def words_field(corpus=None, field='abstract'):
...
@@ -79,42 +124,15 @@ def words_field(corpus=None, field='abstract'):
#
#
except
Exception
as
e
:
pass
#print(e)
except
Exception
as
e
:
pass
#print(e)
# l = liste
# du format: terms, stems, count
new_grams
=
[
Ngram
(
terms
=
x
[
0
],
stem
=
x
[
1
],
n
=
x
[
2
])
for
x
in
l
]
new_grams
=
[
Ngram
(
terms
=
x
[
0
],
stem
=
x
[
1
],
n
=
x
[
2
])
for
x
in
l
]
new_gramDoc
=
[
NgramDocumentTemporary
(
terms
=
k
,
document
=
pk
,
occurrences
=
d
[
k
][
pk
][
'count'
])
\
new_gramDoc
=
[
NgramDocumentTemporary
(
terms
=
k
,
document
=
pk
,
occurrences
=
d
[
k
][
pk
][
'count'
])
\
for
k
in
d
.
keys
()
\
for
k
in
d
.
keys
()
\
for
pk
in
d
[
k
]
.
keys
()
\
for
pk
in
d
[
k
]
.
keys
()
]
]
NgramTemporary
.
objects
.
bulk_create
(
new_grams
)
NgramDocumentTemporary
.
objects
.
bulk_create
(
new_gramDoc
)
cursor
=
connection
.
cursor
()
# LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;
query_string
=
"""
INSERT INTO documents_ngram
SELECT * FROM documents_ngramtemporary WHERE NOT EXISTS
( SELECT 1 FROM documents_ngram WHERE
documents_ngram.terms = documents_ngramtemporary.terms);
delete from documents_ngramtemporary;
INSERT INTO
documents_ngramdocument (terms_id, document_id, occurrences)
SELECT
GT.id, DT.id, NDT.occurrences
FROM
documents_ngramdocumenttemporary as NDT
INNER JOIN documents_document AS DT ON DT.id = NDT.document
INNER JOIN documents_ngram AS GT ON GT.terms = NDT.terms ;
delete from documents_ngramdocumenttemporary;
"""
cursor
.
execute
(
query_string
)
save_newgrams
(
new_grams
)
def
words_fields
(
corpus
=
None
,
fields
=
[
'title'
,]):
def
words_fields
(
corpus
=
None
,
fields
=
[
'title'
,]):
try
:
try
:
for
field
in
fields
:
for
field
in
fields
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment