humanities / gargantext · Commit ea5c610f
Authored Feb 23, 2015 by Administrator

Merge branch 'unstable' into testing

Parents: 9cdf6c1e 5fd8afc0
Changes: 5 changed files, with 161 additions and 13 deletions.

  analysis/functions.py                         +6    -5
  node/models.py                                +150  -7
  parsing/FileParsers/__init__.py               +1    -0
  parsing/NgramsExtractors/NgramsExtractor.py   +2    -0
  scrap_pubmed/views.py                         +2    -1
analysis/functions.py
@@ -268,19 +268,20 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150

 from analysis.tfidf import tfidf

 def do_tfidf(corpus, reset=True):
-    print("doing tfidf")
+    print("=========== doing tfidf ===========")
     with transaction.atomic():
         if reset == True:
             NodeNodeNgram.objects.filter(nodex=corpus).delete()
         if isinstance(corpus, Node) and corpus.type.name == "Corpus":
             # print("\n- - - - - - - - - - ")
             # for i in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
             #     print("^^^",i)
             # # for i in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
             for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
-                for node_ngram in Node_Ngram.objects.filter(node=document):
                 # print("the doc:",document)
+                somevariable = Node_Ngram.objects.filter(node=document)
+                for node_ngram in somevariable:
                     try:
                         # print("\t ",node_ngram.ngram)
                         # print("\t ngram:",node_ngram.ngram, " @@@ type:",type(node_ngram.ngram))
                         nnn = NodeNodeNgram.objects.get(nodex=corpus, nodey=document, ngram=node_ngram.ngram)
                     except:
                         score = tfidf(corpus, document, node_ngram.ngram)
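Note: the try/except above treats a failed NodeNodeNgram.objects.get() as "no score stored yet" and only then computes tfidf, but the bare except also swallows unrelated errors. A minimal sketch of the narrower idiom, using only names visible in the hunk; the final create() call is an assumption, since the hunk is cut off right after the score is computed:

    # Sketch: catch only the "row not found" case, not every exception.
    try:
        nnn = NodeNodeNgram.objects.get(nodex=corpus, nodey=document, ngram=node_ngram.ngram)
    except NodeNodeNgram.DoesNotExist:
        score = tfidf(corpus, document, node_ngram.ngram)
        # presumably followed by NodeNodeNgram.objects.create(..., score=score)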
node/models.py
@@ -10,8 +10,11 @@ from cte_tree.models import CTENode, CTENodeManager

 from parsing.Caches import LanguagesCache, NgramsExtractorsCache, NgramsCaches
 from parsing.FileParsers import *
+from time import time
+import datetime
+from multiprocessing import Process
 from collections import defaultdict
 import hashlib
@@ -175,12 +178,6 @@ class Node(CTENode):
                 'europress_english' : EuropressFileParser,
             })[resource.type.name]()
             metadata_list += parser.parse(str(resource.file))
-            # print(parser.parse(str(resource.file)))
-            # # retrieve info from the database
-            # print("\n - - -- - - - - - - - ")
-            # for i in metadata_list:
-            #     print("***",i["title"])
-            # print("- - -- - - - - - - - \n")
         type_id = NodeType.objects.get(name='Document').id
         langages_cache = LanguagesCache()
         user_id = self.user.id
@@ -208,7 +205,6 @@ class Node(CTENode):
         # mark the resources as parsed for this node
         self.node_resource.update(parsed=True)

     @current_app.task(filter=task_method)
     def extract_ngrams(self, keys, ngramsextractorscache=None, ngramscaches=None):
         # if there is no cache...
@@ -233,6 +229,9 @@ class Node(CTENode):
                     for ngram in extractor.extract_ngrams(self.metadata[key]):
                         terms = ' '.join([token for token, tag in ngram])
                         associations[terms] += 1
             # import pprint
             # pprint.pprint(associations)
             #print(associations)
             # insert the occurrences in the database
             # print(associations.items())
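For reference, the ' '.join(...) line above flattens one tagged ngram (a list of (token, tag) pairs) into the plain-text key used for counting. A self-contained sketch with illustrative tags:

    ngram = [("text", "NN"), ("mining", "NN")]
    terms = ' '.join([token for token, tag in ngram])
    assert terms == "text mining"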
@@ -282,6 +281,150 @@ class Node(CTENode):
            self.metadata['Processing'] = 0
            self.save()

    def parse_resources__MOV(self, verbose=False):
        # parse all resources into a list of metadata
        metadata_list = []
        print("not parsed resources:")
        print(self.node_resource.filter(parsed=False))
        print("= = = = = = = = = = = \n")
        for node_resource in self.node_resource.filter(parsed=False):
            resource = node_resource.resource
            parser = defaultdict(lambda: FileParser.FileParser, {
                'istext'            : ISText,
                'pubmed'            : PubmedFileParser,
                'isi'               : IsiFileParser,
                'ris'               : RisFileParser,
                'europress'         : EuropressFileParser,
                'europress_french'  : EuropressFileParser,
                'europress_english' : EuropressFileParser,
            })[resource.type.name]()
            metadata_list += parser.parse(str(resource.file))
        self.node_resource.update(parsed=True)
        #writing to DB
        return metadata_list

    def writeMetadata__MOV(self, metadata_list=None, verbose=False):
        type_id = NodeType.objects.get(name='Document').id
        user_id = self.user.id
        langages_cache = LanguagesCache()
        # # insert the new resources in the database!
        for i, metadata_values in enumerate(metadata_list):
            name = metadata_values.get('title', '')[:200]
            language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
            if isinstance(language, tuple):
                language = language[0]
            Node(
                user_id     = user_id,
                type_id     = type_id,
                name        = name,
                parent      = self,
                language_id = language.id if language else None,
                metadata    = metadata_values,
            ).save()
            metadata_list[i]["thelang"] = language
        # # make metadata filterable
        self.children.all().make_metadata_filterable()
        # # mark the resources as parsed for this node
        self.node_resource.update(parsed=True)

    def extract_ngrams__MOV(self, array, keys, ngramsextractorscache=None, ngramscaches=None):
        if ngramsextractorscache is None:
            ngramsextractorscache = NgramsExtractorsCache()
        langages_cache = LanguagesCache()
        if ngramscaches is None:
            ngramscaches = NgramsCaches()
        for metadata in array:
            associations = defaultdict(float) # float or int?
            language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
            if isinstance(language, tuple):
                language = language[0]
            metadata["thelang"] = language
            extractor = ngramsextractorscache[language]
            ngrams = ngramscaches[language]
            # print("\t\t number of req keys:",len(keys)," AND isdict?:",isinstance(keys, dict))
            if isinstance(keys, dict):
                for key, weight in keys.items():
                    if key in metadata:
                        for ngram in extractor.extract_ngrams(metadata[key]):
                            terms = ' '.join([token for token, tag in ngram])
                            associations[ngram] += weight
            else:
                for key in keys:
                    if key in metadata:
                        # print("the_content:[[[[[[__",metadata[key],"__]]]]]]")
                        for ngram in extractor.extract_ngrams(metadata[key]):
                            terms = ' '.join([token for token, tag in ngram])
                            associations[terms] += 1
            if len(associations.items()) > 0:
                Node_Ngram.objects.bulk_create([
                    Node_Ngram(node=self, ngram=ngrams[ngram_text], weight=weight)
                    for ngram_text, weight in associations.items()
                ])
                # for ngram_text, weight in associations.items():
                #     print("ngram_text:",ngram_text," | weight:",weight, " | ngrams[ngram_text]:",ngrams[ngram_text])

    def runInParallel(self, *fns):
        proc = []
        for fn in fns:
            p = Process(target=fn)
            p.start()
            proc.append(p)
        for p in proc:
            p.join()

    def workflow__MOV(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
        import time
        total = 0
        self.metadata['Processing'] = 1
        self.save()

        print("LOG::TIME: In workflow() parse_resources__MOV()")
        start = time.time()
        theMetadata = self.parse_resources__MOV()
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " parse_resources()__MOV [s]", (end - start))

        print("LOG::TIME: In workflow() writeMetadata__MOV()")
        start = time.time()
        self.writeMetadata__MOV(metadata_list=theMetadata)
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " writeMetadata__MOV() [s]", (end - start))

        print("LOG::TIME: In workflow() extract_ngrams__MOV()")
        start = time.time()
        self.extract_ngrams__MOV(theMetadata, keys=['title', 'abstract',])
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " extract_ngrams__MOV() [s]", (end - start))

        # # this is not working
        # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )

        start = time.time()
        print("LOG::TIME: In workflow() do_tfidf()")
        from analysis.functions import do_tfidf
        do_tfidf(self)
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " do_tfidf() [s]", (end - start))
        # # print("LOG::TIME: In workflow() / do_tfidf()")
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " In workflow() END")

        self.metadata['Processing'] = 0
        self.save()


class Node_Metadata(models.Model):
    node     = models.ForeignKey(Node, on_delete=models.CASCADE)
    metadata = models.ForeignKey(Metadata)
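Note on the hunk above: per its header (+150 lines against a handful of context lines), essentially everything from parse_resources__MOV down to workflow__MOV is new in this commit. Two details worth flagging. First, in the dict branch of extract_ngrams__MOV the weight is accumulated under associations[ngram] (the raw tagged tuple) while terms is computed and discarded; the list branch uses associations[terms], so the two branches key the dictionary differently. Second, the line commented "this is not working" calls writeMetadata__MOV and extract_ngrams__MOV immediately and hands their return values to runInParallel, so nothing is actually deferred to the worker processes. A sketch of what was presumably intended, deferring the calls with functools.partial (a hypothetical fix, not part of the commit):

    from functools import partial

    # Pass callables, not call results, so Process(target=fn) defers execution.
    self.runInParallel(
        partial(self.writeMetadata__MOV, metadata_list=theMetadata),
        partial(self.extract_ngrams__MOV, theMetadata, keys=['title', 'abstract']),
    )

Even then, forked processes sharing a Django database connection can be fragile, which may be why the call stayed commented out.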
parsing/FileParsers/__init__.py
@@ -2,3 +2,4 @@ from .RisFileParser import RisFileParser
 from .IsiFileParser import IsiFileParser
 from .PubmedFileParser import PubmedFileParser
 from .EuropressFileParser import EuropressFileParser
+from .ISText import ISText
parsing/NgramsExtractors/NgramsExtractor.py
@@ -29,6 +29,8 @@ class NgramsExtractor:
     """
     def extract_ngrams(self, contents):
         tagged_ngrams = self.tagger.tag_text(contents)
+        if len(tagged_ngrams) == 0:
+            return []
         grammar = nltk.RegexpParser(self._rule)
         result = []
         # try:
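The two added lines guard against an empty tagger result before the ngrams are chunked. For context, nltk.RegexpParser chunks a list of (token, tag) pairs against a rule; a standalone sketch (the NP rule here is illustrative, not the extractor's actual self._rule):

    import nltk

    # Illustrative rule; the real pattern lives in self._rule.
    parser = nltk.RegexpParser("NP: {<JJ>*<NN.*>+}")
    tagged = [("high", "JJ"), ("dimensional", "JJ"), ("data", "NNS")]
    print(parser.parse(tagged))  # Tree('S', [Tree('NP', [...])])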
scrap_pubmed/views.py
@@ -133,6 +133,7 @@ def doTheQuery(request , project_id):
     try:
         if DEBUG is True:
             corpus.workflow()
+            # corpus.workflow__MOV()
         else:
             corpus.workflow.apply_async((), countdown=3)
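The branch keeps the synchronous corpus.workflow() call for DEBUG runs and otherwise hands the work to Celery; apply_async((), countdown=3) schedules the bound task to start roughly three seconds later. The added comment stages the new pipeline as a drop-in replacement for the debug path, presumably along these lines (hypothetical; the commit only adds the comment):

    if DEBUG is True:
        corpus.workflow__MOV()   # run the new pipeline synchronously
    else:
        corpus.workflow.apply_async((), countdown=3)  # Celery: run ~3 s later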
@@ -205,7 +206,7 @@ def testISTEX(request , project_id):
     corpus.save()
     print("DEBUG:", DEBUG)
     # do the WorkFlow
     try:
         if DEBUG is True: