humanities / gargantext / Commits

Commit 28bb9273 authored Feb 25, 2015 by PkSM3
[UPDATE] progress in workflow__MOV
parent 4fbc8a98
Showing 6 changed files with 89 additions and 57 deletions
analysis/functions.py                        +2  -3
analysis/tfidf.py                            +1  -0
node/models.py                               +79 -51
parsing/FileParsers/PubmedFileParser.py      +2  -0
parsing/NgramsExtractors/NgramsExtractor.py  +2  -2
parsing/Taggers/Tagger.py                    +3  -1
analysis/functions.py (view file @ 28bb9273)
@@ -268,7 +268,7 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
 from analysis.tfidf import tfidf

 def do_tfidf(corpus, reset=True):
-    print("=========== doing tfidf ===========")
+    # print("=========== doing tfidf ===========")
     with transaction.atomic():
         if reset == True:
             NodeNodeNgram.objects.filter(nodex=corpus).delete()
@@ -278,8 +278,7 @@ def do_tfidf(corpus, reset=True):
     # # for i in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
     for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
         # print("the doc:",document)
-        somevariable = Node_Ngram.objects.filter(node=document)
-        for node_ngram in somevariable:
+        for node_ngram in Node_Ngram.objects.filter(node=document):
             # print("\tngram:",node_ngram.ngram)
             try:
                 nnn = NodeNodeNgram.objects.get(nodex=corpus, nodey=document, ngram=node_ngram.ngram)
analysis/tfidf.py (view file @ 28bb9273)
@@ -60,6 +60,7 @@ def tfidf(corpus, document, ngram):
         .filter(NodeNgram.ngram_id == ngram.id) \
         .count()
+    # print("\t\t\t","occs:",occurrences_of_ngram," || ngramsbydoc:",ngrams_by_document," || TF = occ/ngramsbydoc:",term_frequency," |||||| x:",xx," || y:",yy," || IDF = log(x/y):",log(xx/yy))
     inverse_document_frequency = log(xx/yy)
     # result = tf * idf
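Note: a minimal, self-contained sketch of the TF-IDF arithmetic these two files implement: tf is the n-gram's occurrences in a document divided by the document's n-gram count, and idf is the natural log of N over the number of documents containing the n-gram. Plain values stand in for the Node/NodeNgram tables; the function signature below is illustrative, not the repo's API.

from math import log

def tfidf_value(occurrences_of_ngram, ngrams_by_document, n_documents, docs_with_ngram):
    term_frequency = occurrences_of_ngram / ngrams_by_document
    inverse_document_frequency = log(n_documents / docs_with_ngram)  # natural log, as in tfidf.py
    return term_frequency * inverse_document_frequency

# e.g. an n-gram occurring 3 times among 100 n-grams of one document,
# present in 10 of 1000 documents:
print(tfidf_value(3, 100, 1000, 10))  # ~0.138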
node/models.py (view file @ 28bb9273)
@@ -14,6 +14,7 @@ from parsing.FileParsers import *
 from time import time
 import datetime
 from multiprocessing import Process
 from math import log
 from collections import defaultdict
 import hashlib
@@ -229,13 +230,6 @@ class Node(CTENode):
                 for ngram in extractor.extract_ngrams(self.metadata[key]):
                     terms = ' '.join([token for token, tag in ngram])
                     associations[terms] += 1
-        import pprint
-        pprint.pprint(associations)
-        print(" - - - - - ")
-        #print(associations)
         # insert the occurrences in the database
-        # print(associations.items())
         Node_Ngram.objects.bulk_create([
             Node_Ngram(node=self,
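Note: the counting pattern kept above, joining each extracted (token, tag) n-gram into a term string and accumulating occurrences in a defaultdict, as a standalone sketch; the extractor output is faked here.

from collections import defaultdict

def count_terms(ngrams):
    associations = defaultdict(int)
    for ngram in ngrams:                              # each ngram: [(token, tag), ...]
        terms = ' '.join(token for token, tag in ngram)
        associations[terms] += 1
    return associations

counts = count_terms([[('cell', 'NN'), ('biology', 'NN')],
                      [('cell', 'NN'), ('biology', 'NN')]])
# counts == {'cell biology': 2}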
@@ -284,6 +278,14 @@ class Node(CTENode):
+    def runInParallel(self, *fns):
+        proc = []
+        for fn in fns:
+            p = Process(target=fn)
+            p.start()
+            proc.append(p)
+        for p in proc:
+            p.join()

     def parse_resources__MOV(self, verbose=False):
         # parse all resources into a list of metadata
@@ -324,7 +326,6 @@ class Node(CTENode):
                 language_id = language.id if language else None,
                 metadata = metadata_values
             ).save()
-            metadata_list[i]["thelang"] = language
         # # make metadata filterable
         self.children.all().make_metadata_filterable()
         # # mark the resources as parsed for this node
@@ -338,48 +339,63 @@ class Node(CTENode):
         if ngramscaches is None:
             ngramscaches = NgramsCaches()
         results = []
         i = 0
         for metadata in array:
             associations = defaultdict(float) # float or int?
             language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
             if isinstance(language, tuple): language = language[0]
             metadata["thelang"] = language
             extractor = ngramsextractorscache[language]
             ngrams = ngramscaches[language]
             # print("\t\t number of req keys:",len(keys)," AND isdict?:",isinstance(keys, dict))
             # theText = []
             if isinstance(keys, dict):
                 for key, weight in keys.items():
                     if key in metadata:
-                        for ngram in extractor.extract_ngrams(metadata[key]):
-                            terms = ' '.join([token for token, tag in ngram])
+                        text2process = str(metadata[key]).replace('[','').replace(']','')
+                        # theText.append(text2process)
+                        for ngram in extractor.extract_ngrams(text2process):
+                            terms = ' '.join([token for token, tag in ngram]).strip().lower()
                             associations[ngram] += weight
             else:
                 for key in keys:
                     if key in metadata:
                         # print("the_content:[[[[[[__",metadata[key],"__]]]]]]")
-                        for ngram in extractor.extract_ngrams(metadata[key]):
-                            terms = ' '.join([token for token, tag in ngram])
+                        text2process = str(metadata[key]).replace('[','').replace(']','')
+                        # theText.append(text2process)
+                        for ngram in extractor.extract_ngrams(text2process):
+                            terms = ' '.join([token for token, tag in ngram]).strip().lower()
                             associations[terms] += 1
             if len(associations.items()) > 0:
                 Node_Ngram.objects.bulk_create([
                     Node_Ngram(
                         node = self,
                         ngram = ngrams[ngram_text],
                         weight = weight
                     )
                     for ngram_text, weight in associations.items()
                 ])
                 # for ngram_text, weight in associations.items():
                 #     print("ngram_text:",ngram_text," | weight:",weight, " | ngrams[ngram_text]:",ngrams[ngram_text])
                 if (len(associations) > 0): results.append([i, associations])
             i += 1
         return results

     def runInParallel(self, *fns):
         proc = []
         for fn in fns:
             p = Process(target=fn)
             p.start()
             proc.append(p)
         for p in proc:
             p.join()

     def do_tfidf__MOV(self, FreqList):
         IDFList = {}
         for i in FreqList:
             arrayID = i[0]
             associations = i[1]
             for ngram_text, weight in associations.items():
                 if ngram_text in IDFList:
                     IDFList[ngram_text] += 1
                 else:
                     IDFList[ngram_text] = 1
         N = float(len(FreqList)) # nro docs really processed
         for i in FreqList:
             arrayID = i[0]
             associations = i[1]
             ngrams_by_document = len(associations.items())
             for ngram_text, weight in associations.items():
                 occurrences_of_ngram = weight
                 term_frequency = occurrences_of_ngram / ngrams_by_document
                 xx = N
                 yy = IDFList[ngram_text]
                 inverse_document_frequency = log(xx/yy) # log base e

     def workflow__MOV(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
         import time
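Note: the new text2process / .strip().lower() lines normalize metadata values before extraction (stray brackets dropped, joined terms lowercased and stripped); the dict branch still accumulates on ngram rather than the normalized terms, which looks like a leftover. A standalone sketch of the normalization, with a fake extractor standing in for the project's:

def normalize_terms(value, extract_ngrams):
    # drop stray brackets, then lowercase the joined term string
    text2process = str(value).replace('[', '').replace(']', '')
    for ngram in extract_ngrams(text2process):          # ngram: [(token, tag), ...]
        yield ' '.join(token for token, tag in ngram).strip().lower()

fake_extractor = lambda text: [[(w, 'NN')] for w in text.split()]
print(list(normalize_terms("[Cell] Biology", fake_extractor)))
# ['cell', 'biology']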
@@ -394,32 +410,44 @@ class Node(CTENode):
         total += (end - start)
         print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " parse_resources()__MOV [s]", (end - start))

-        print("LOG::TIME: In workflow() writeMetadata__MOV()")
-        start = time.time()
-        self.writeMetadata__MOV(metadata_list=theMetadata)
-        end = time.time()
-        total += (end - start)
-        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " writeMetadata__MOV() [s]", (end - start))
+        # print("LOG::TIME: In workflow() writeMetadata__MOV()")
+        # start = time.time()
+        # self.writeMetadata__MOV(metadata_list=theMetadata)
+        # end = time.time()
+        # total += (end - start)
+        # print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " writeMetadata__MOV() [s]", (end - start))

         print("LOG::TIME: In workflow() extract_ngrams__MOV()")
         start = time.time()
-        self.extract_ngrams__MOV(theMetadata, keys=['title','abstract',])
+        FreqList = self.extract_ngrams__MOV(theMetadata, keys=['title',])
         end = time.time()
         total += (end - start)
         print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " extract_ngrams__MOV() [s]", (end - start))

         # # this is not working
         # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
-        # start = time.time()
-        # print("LOG::TIME: In workflow() do_tfidf()")
-        # self.do_tfidf__MOV( FreqList )
-        # end = time.time()
-        # total += (end - start)
-        # print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " do_tfidf() [s]", (end - start))
-        # # # print("LOG::TIME: In workflow() / do_tfidf()")
-        # # # this is not working
-        # # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
+        start = time.time()
+        print("LOG::TIME: In workflow() do_tfidf()")
+        from analysis.functions import do_tfidf
+        do_tfidf(self)
+        end = time.time()
+        total += (end - start)
+        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " do_tfidf() [s]", (end - start))
+        # # print("LOG::TIME: In workflow() / do_tfidf()")
+        # start = time.time()
+        # print("LOG::TIME: In workflow() do_tfidf()")
+        # from analysis.functions import do_tfidf
+        # do_tfidf(self)
+        # end = time.time()
+        # total += (end - start)
+        # print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " do_tfidf() [s]", (end - start))
+        # #
+        # print("LOG::TIME: In workflow() / do_tfidf()")

         print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " In workflow() END")
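Note: the LOG::TIME prints around each workflow stage follow one pattern: record start, run the stage, accumulate the elapsed time, print an ISO timestamp plus the stage name. A hedged sketch of the same pattern as a context manager (not part of the repo):

import datetime
import time
from contextlib import contextmanager

total = 0.0

@contextmanager
def log_time(stage):
    global total
    start = time.time()
    yield
    end = time.time()
    total += (end - start)
    print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " " + stage + " [s]", (end - start))

with log_time("extract_ngrams__MOV()"):
    time.sleep(0.1)  # stand-in for the real stage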
parsing/FileParsers/PubmedFileParser.py (view file @ 28bb9273)
@@ -25,6 +25,7 @@ class PubmedFileParser(FileParser):
     metadata_path = {
         "journal"       : 'MedlineCitation/Article/Journal/Title',
         "title"         : 'MedlineCitation/Article/ArticleTitle',
         # "abstract"    : 'MedlineCitation/Article/Abstract/AbstractText',
+        "title2"        : 'MedlineCitation/Article/VernacularTitle',
         "language_iso3" : 'MedlineCitation/Article/Language',
         "doi"           : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',

@@ -102,6 +103,7 @@ class PubmedFileParser(FileParser):
+            if "title2" in metadata: metadata.pop("title2")
             # print(metadata)
             # print("* * * * ** * * * * ")
             metadata_list.append(metadata)
         # return the list of metadata
         return metadata_list
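Note: a standalone sketch (not the project's FileParser API) of how a metadata_path dict like the one above can be resolved against a Pubmed XML record using only the standard library; just a few of the paths are carried over, and the function name is illustrative.

import xml.etree.ElementTree as ET

metadata_path = {
    "journal":       'MedlineCitation/Article/Journal/Title',
    "title":         'MedlineCitation/Article/ArticleTitle',
    "language_iso3": 'MedlineCitation/Article/Language',
}

def extract_metadata(xml_text):
    root = ET.fromstring(xml_text)
    metadata = {}
    for key, path in metadata_path.items():
        node = root.find(path)            # first match, or None
        if node is not None and node.text:
            metadata[key] = node.text.strip()
    return metadata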
parsing/NgramsExtractors/NgramsExtractor.py (view file @ 28bb9273)
-from ..Taggers import Tagger
+from ..Taggers import TurboTagger
 import nltk

@@ -18,7 +18,7 @@ class NgramsExtractor:
         self.stop()

     def start(self):
-        self.tagger = Tagger()
+        self.tagger = TurboTagger()

     def stop(self):
         pass
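Note: the extractor only relies on the tagger exposing tag_text(text) returning (token, tag) pairs, so Tagger and TurboTagger are interchangeable here. An illustrative stand-in built on nltk (already imported in this file) rather than the project's TurboTagger; it assumes the nltk tokenizer and tagger data are downloaded.

import nltk

class NltkTagger:
    """Stand-in tagger: tag_text(text) -> [(token, tag), ...]."""
    def tag_text(self, text):
        tokens = nltk.word_tokenize(text)   # requires nltk 'punkt' data
        return nltk.pos_tag(tokens)         # requires nltk tagger data

# NltkTagger().tag_text("cell biology")  # e.g. [('cell', 'NN'), ('biology', 'NN')]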
parsing/Taggers/Tagger.py (view file @ 28bb9273)
@@ -58,9 +58,11 @@ class Tagger:
         if single:
             self.tagging_end()
         return []

+    """Send a text to be tagged.
+    """
+    # Not used right now
     def tag_text(self, text):
         tokens_tags = []
         self.tagging_start()