humanities / gargantext

Commit 2b36c5d0 authored 10 years ago by PkSM3

[UPDATE] pubmed|istex removal (for the moment)
parent fc089d47

Showing 2 changed files with 117 additions and 456 deletions:
node/models.py          +0   -339
scrap_pubmed/views.py   +117 -117

node/models.py
@@ -287,346 +287,7 @@ class Node(CTENode):
         print("In workflow() END")
         self.metadata['Processing'] = 0
         self.save()
-    def runInParallel(self, *fns):
-        proc = []
-        for fn in fns:
-            p = Process(target=fn)
-            p.start()
-            proc.append(p)
-        for p in proc:
-            p.join()
-
-    def parse_resources__MOV(self, verbose=False):
-        # parse all resources into a list of metadata
-        metadata_list = []
-        print("not parsed resources:")
-        print(self.node_resource.filter(parsed=False))
-        print("= = = = = = = = = = =\n")
-        for node_resource in self.node_resource.filter(parsed=False):
-            resource = node_resource.resource
-            parser = defaultdict(lambda: FileParser.FileParser, {
-                'istext'            : ISText,
-                'pubmed'            : PubmedFileParser,
-                'isi'               : IsiFileParser,
-                'ris'               : RisFileParser,
-                'europress'         : EuropressFileParser,
-                'europress_french'  : EuropressFileParser,
-                'europress_english' : EuropressFileParser,
-            })[resource.type.name]()
-            metadata_list += parser.parse(str(resource.file))
-        self.node_resource.update(parsed=True) #writing to DB
-        return metadata_list
-
-    def writeMetadata__MOV(self, metadata_list=None, verbose=False):
-        type_id = NodeType.objects.get(name='Document').id
-        user_id = self.user.id
-        langages_cache = LanguagesCache()
-        # # insert the new resources in the database!
-        for i, metadata_values in enumerate(metadata_list):
-            name = metadata_values.get('title', '')[:200]
-            language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
-            if isinstance(language, tuple):
-                language = language[0]
-            try:
-                node = Node(
-                    user_id     = user_id,
-                    type_id     = type_id,
-                    name        = name,
-                    parent      = self,
-                    language_id = language.id if language else None,
-                    metadata    = metadata_values
-                )
-                node.save()
-            except Exception as e:
-                print("ERR::node.models.writeMetadata__MOV ")
-            metadata_values["id"] = node.id
-        # # make metadata filterable
-        self.children.all().make_metadata_filterable()
-        # # mark the resources as parsed for this node
-        self.node_resource.update(parsed=True)
-        return metadata_list
-
-    def extract_ngrams__MOV(self, array, keys, ngramsextractorscache=None, ngramscaches=None):
-        if ngramsextractorscache is None:
-            ngramsextractorscache = NgramsExtractorsCache()
-        langages_cache = LanguagesCache()
-        if ngramscaches is None:
-            ngramscaches = NgramsCaches()
-        results = []
-        i = 0
-        for metadata in array:
-            associations = defaultdict(float)  # float or int?
-            language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
-            if isinstance(language, tuple):
-                language = language[0]
-            extractor = ngramsextractorscache[language]
-            ngrams = ngramscaches[language]
-            # theText = []
-            if isinstance(keys, dict):
-                for key, weight in keys.items():
-                    if key in metadata:
-                        text2process = str(metadata[key]).replace('[', '').replace(']', '')
-                        # theText.append(text2process)
-                        for ngram in extractor.extract_ngrams(text2process):
-                            terms = ' '.join([token for token, tag in ngram]).strip().lower()
-                            associations[ngram] += weight
-            else:
-                for key in keys:
-                    if key in metadata:
-                        text2process = str(metadata[key]).replace('[', '').replace(']', '')
-                        # theText.append(text2process)
-                        for ngram in extractor.extract_ngrams(text2process):
-                            terms = ' '.join([token for token, tag in ngram]).strip().lower()
-                            associations[terms] += 1
-            if (len(associations) > 0):
-                results.append([metadata["id"], associations])
-            i += 1
-        return results
-
-    def do_tfidf__MOV(self, FreqList, Metadata, n=150):
-        from analysis.InterUnion import Utils
-        calc = Utils()
-        # *01* [ calculating global count of each ngram ]
-        GlobalCount = {}
-        for i in FreqList:
-            docID = i[0]
-            associations = i[1]
-            for ngram_text, weight in associations.items():
-                if len(ngram_text.split()) > 1 and len(ngram_text.split()) < 4:  # considering just {2,3}-grams
-                    if ngram_text in GlobalCount:
-                        if "C" in GlobalCount[ngram_text]:
-                            GlobalCount[ngram_text]["C"] += 1
-                        else:
-                            GlobalCount[ngram_text] = {}
-                            GlobalCount[ngram_text]["C"] = 1
-                    else:
-                        GlobalCount[ngram_text] = {}
-                        GlobalCount[ngram_text]["C"] = 1
-        # *01* [ / calculating global count of each ngram ]
-
-        # *02* [ Considering the first <150 ngrams by DESC occurrences ]
-        FirstNgrams = {}
-        sortedList = sorted(GlobalCount, key=lambda x: GlobalCount[x]['C'], reverse=True)
-        for i in range(len(sortedList)):
-            term = sortedList[i]
-            FirstNgrams[term] = {}
-            FirstNgrams[term]["C"] = GlobalCount[term]["C"]
-            if i == (n - 1):
-                break
-        # *02* [ / Considering the first <150 ngrams by DESC occurrences ]
-
-        N = float(len(FreqList))  # nro docs really processed
-
-        # *03* [ making dictionaries for NGram_Text <=> NGram_ID ]
-        NGram2ID = {}
-        ID2NGram = {}
-        ngramid = 0
-        for i in FirstNgrams:
-            NGram2ID[i] = ngramid
-            ID2NGram[ngramid] = i
-            ngramid += 1
-        # *03* [ / making dictionaries for NGram_Text <=> NGram_ID ]
-
-        docs_X_terms = {}
-        for i in FreqList:  # foreach ID in Doc:
-            docID = i[0]
-            associations = i[1]
-            # [ considering just {2,3}-grams ]
-            termsCount = 0
-            for ngram_text, weight in associations.items():
-                if ngram_text in NGram2ID:  # considering just {2,3}-grams
-                    termsCount += 1
-            # [ / considering just {2,3}-grams ]
-            ngrams_by_document = termsCount  # i re-calculed this because of *02*
-            terms = []
-            terms_occ = []
-            if ngrams_by_document > 0:
-                for ngram_text, weight in associations.items():
-                    if ngram_text in NGram2ID:
-                        terms.append(NGram2ID[ngram_text])
-                        # [ calculating TF-IDF ]
-                        occurrences_of_ngram = weight
-                        term_frequency = occurrences_of_ngram / ngrams_by_document
-                        xx = N
-                        yy = FirstNgrams[ngram_text]["C"]
-                        inverse_document_frequency = log(xx / yy)  # log base e
-                        tfidfScore = term_frequency * inverse_document_frequency
-                        terms_occ.append([NGram2ID[ngram_text], round(tfidfScore, 3)])
-                        # [ / calculating TF-IDF ]
-                        if "T" in FirstNgrams[ngram_text]:
-                            FirstNgrams[ngram_text]["T"].append(tfidfScore)
-                        else:
-                            FirstNgrams[ngram_text]["T"] = [tfidfScore]
-                if len(terms) > 1:
-                    docs_X_terms[docID] = terms_occ
-                    # print("docid:",docID)
-                    # for i in terms:
-                    #     print("\t",ID2NGram[i])
-                    calc.addCompleteSubGraph(terms)
-        return {"G": calc.G, "TERMS": ID2NGram, "ii": docs_X_terms, "metrics": FirstNgrams}
-
-    def do_coocmatrix__MOV(self, TERMS, G, n=150, type='node_link'):
-        import pandas as pd
-        from copy import copy
-        import numpy as np
-        import networkx as nx
-        from networkx.readwrite import json_graph
-        from gargantext_web.api import JsonHttpResponse
-        from analysis.louvain import best_partition
-
-        matrix = defaultdict(lambda: defaultdict(float))
-        ids = dict()
-        labels = dict()
-        weight = dict()
-        print("PRINTING NUMBER OF NODES 01:", len(G))
-        for e in G.edges_iter():
-            n1 = e[0]
-            n2 = e[1]
-            w = G[n1][n2]['weight']
-            # print(n1," <=> ",n2, " : ", G[n1][n2]['weight'],"\t",TERMS[n1]," <=> ",TERMS[n2], "\t", G[n1][n2]['weight'])
-            ids[TERMS[n1]] = n1
-            ids[TERMS[n2]] = n2
-            labels[n1] = TERMS[n1]
-            labels[n2] = TERMS[n2]
-            matrix[n1][n2] = w
-            matrix[n2][n1] = w
-            weight[n2] = weight.get(n2, 0) + w
-            weight[n1] = weight.get(n1, 0) + w
-
-        df = pd.DataFrame(matrix).fillna(0)
-        x = copy(df.values)
-        x = x / x.sum(axis=1)
-
-        # Removing unconnected nodes
-        threshold = min(x.max(axis=1))
-        matrix_filtered = np.where(x >= threshold, 1, 0)
-        #matrix_filtered = np.where(x > threshold, x, 0)
-        #matrix_filtered = matrix_filtered.resize((90,90))
-
-        G = nx.from_numpy_matrix(matrix_filtered)
-        # G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
-        partition = best_partition(G)
-
-        data = []
-        if type == "node_link":
-            for community in set(partition.values()):
-                G.add_node("cluster " + str(community), hidden=1)
-            for node in G.nodes():
-                try:
-                    G.node[node]['label'] = TERMS[node]
-                    G.node[node]['pk'] = node
-                    G.node[node]['size'] = weight[node]
-                    G.node[node]['group'] = partition[node]
-                    G.add_edge(node, "cluster " + str(partition[node]), weight=3)
-                except Exception as error:
-                    print("ERROR:", error)
-            print("PRINTING NUMBER OF NODES 02:", len(G))
-            data = json_graph.node_link_data(G)
-        elif type == "adjacency":
-            for node in G.nodes():
-                try:
-                    #node,type(labels[node])
-                    #G.node[node]['label'] = node
-                    G.node[node]['name'] = node
-                    #G.node[node]['size'] = weight[node]
-                    G.node[node]['group'] = partition[node]
-                    #G.add_edge(node, partition[node], weight=3)
-                except Exception as error:
-                    print(error)
-            data = json_graph.node_link_data(G)
-        return data
-
-    def workflow__MOV(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
-        import time
-        total = 0
-        self.metadata['Processing'] = 1
-        self.save()
-
-        # # pwd = subprocess.Popen("cd /srv/gargantext/parsing/Taggers/nlpserver && pwd", stdout=subprocess.PIPE).stdout.read()
-        # # print(subprocess.Popen(['ls', '-lah'], stdout=subprocess.PIPE).communicate()[0].decode('utf-8'))
-        # print("activating nlpserver:")
-        # command = 'cd parsing/Taggers/nlpserver; python3 server.py'
-        # process = subprocess.Popen(command,stdout=subprocess.PIPE , stderr=subprocess.DEVNULL , shell=True)
-
-        print("LOG::TIME: In workflow() parse_resources__MOV()")
-        start = time.time()
-        theMetadata = self.parse_resources__MOV()
-        end = time.time()
-        total += (end - start)
-        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " parse_resources()__MOV [s]", (end - start))
-
-        print("LOG::TIME: In workflow() writeMetadata__MOV()")
-        start = time.time()
-        theMetadata = self.writeMetadata__MOV(metadata_list=theMetadata)
-        end = time.time()
-        total += (end - start)
-        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " writeMetadata__MOV() [s]", (end - start))
-
-        print("LOG::TIME: In workflow() extract_ngrams__MOV()")
-        start = time.time()
-        FreqList = self.extract_ngrams__MOV(theMetadata, keys=['title'])
-        end = time.time()
-        total += (end - start)
-        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " extract_ngrams__MOV() [s]", (end - start))
-        # process.kill()
-        # print("ok, process killed")
-
-        start = time.time()
-        print("LOG::TIME: In workflow() do_tfidf()")
-        resultDict = self.do_tfidf__MOV(FreqList, theMetadata)
-        end = time.time()
-        total += (end - start)
-        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " do_tfidf() [s]", (end - start))
-        # # print("LOG::TIME: In workflow() / do_tfidf()")
-
-        start = time.time()
-        print("LOG::TIME: In workflow() do_coocmatrix()")
-        jsongraph = self.do_coocmatrix__MOV(resultDict["TERMS"], resultDict["G"], n=150)
-        jsongraph["stats"] = resultDict["ii"]
-        end = time.time()
-        total += (end - start)
-        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " do_coocmatrix() [s]", (end - start))
-
-        print("the user:", self.user)
-        print("the project id:", self.parent.id)
-        print("the corpus id:", self.id)
-        # timestamp = str(datetime.datetime.now().isoformat())
-        # # filename = MEDIA_ROOT + '/corpora/%s/%s_%s__%s.json' % (self.user , self.parent.id, self.id , timestamp)
-        filename = MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (self.user, self.parent.id, self.id)
-        import json
-        f = open(filename, "w")
-        f.write(json.dumps(jsongraph))
-        f.close()
-
-        # # # this is not working
-        # # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
-
-        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " In workflow() END")
-        self.metadata['Processing'] = 0
-        self.save()
 
 class Node_Metadata(models.Model):
     node = models.ForeignKey(Node, on_delete=models.CASCADE)
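
For reference, a minimal standalone sketch (not part of this commit) of the TF-IDF weighting that the removed do_tfidf__MOV method computed per document and ngram; the function name and the sample numbers are illustrative assumptions only.

from math import log

def tfidf(occurrences_in_doc, ngrams_in_doc, n_docs, docs_containing_ngram):
    # term frequency: occurrences of the ngram over the ngrams kept for this document
    tf = occurrences_in_doc / ngrams_in_doc
    # inverse document frequency: natural log (as in the code above) of corpus size
    # over the number of documents containing the ngram
    idf = log(n_docs / docs_containing_ngram)
    return round(tf * idf, 3)

# e.g. an ngram seen twice among 10 kept ngrams, present in 5 of 100 documents:
print(tfidf(2, 10, 100, 5))  # -> 0.599
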

scrap_pubmed/views.py

@@ -73,75 +73,75 @@ def doTheQuery(request , project_id):
     alist = ["hola" , "mundo"]
     if request.method == "POST":
-        query = request.POST["query"]
+        # query = request.POST["query"]
-        name = request.POST["string"]
+        # name = request.POST["string"]
-        instancia = MedlineFetcher()
+        # instancia = MedlineFetcher()
-        thequeries = json.loads(query)
+        # thequeries = json.loads(query)
-        urlreqs = []
+        # urlreqs = []
-        for yearquery in thequeries:
+        # for yearquery in thequeries:
-            urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
+        #     urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
-        alist = ["tudo fixe" , "tudo bem"]
+        # alist = ["tudo fixe" , "tudo bem"]
-        """
+        # """
-        urlreqs: List of urls to query.
+        # urlreqs: List of urls to query.
-        - Then, to each url in urlreqs you do:
+        # - Then, to each url in urlreqs you do:
-            eFetchResult = urlopen(url)
+        #     eFetchResult = urlopen(url)
-            eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
+        #     eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
-        """
+        # """
-        thefile = "how we do this here?"
+        # thefile = "how we do this here?"
-        resource_type = ResourceType.objects.get(name="pubmed" )
+        # resource_type = ResourceType.objects.get(name="pubmed" )
-        parent = Node.objects.get(id=project_id)
+        # parent = Node.objects.get(id=project_id)
-        node_type = NodeType.objects.get(name='Corpus')
+        # node_type = NodeType.objects.get(name='Corpus')
-        type_id = NodeType.objects.get(name='Document').id
+        # type_id = NodeType.objects.get(name='Document').id
-        user_id = User.objects.get( username=request.user ).id
+        # user_id = User.objects.get( username=request.user ).id
-        corpus = Node(
+        # corpus = Node(
-            user=request.user,
+        #     user=request.user,
-            parent=parent,
+        #     parent=parent,
-            type=node_type,
+        #     type=node_type,
-            name=name,
+        #     name=name,
-        )
+        # )
-        corpus.save()
+        # corpus.save()
-        tasks = MedlineFetcher()
+        # tasks = MedlineFetcher()
-        for i in range(8):
+        # for i in range(8):
-            t = threading.Thread(target=tasks.worker2) #thing to do
+        #     t = threading.Thread(target=tasks.worker2) #thing to do
-            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
+        #     t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
-            t.start()
+        #     t.start()
-        for url in urlreqs:
+        # for url in urlreqs:
-            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
+        #     filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-            tasks.q.put( [url , filename]) #put a task in th queue
+        #     tasks.q.put( [url , filename]) #put a task in th queue
-        tasks.q.join() # wait until everything is finished
+        # tasks.q.join() # wait until everything is finished
-        dwnldsOK = 0
+        # dwnldsOK = 0
-        for filename in tasks.firstResults:
+        # for filename in tasks.firstResults:
-            if filename!=False:
+        #     if filename!=False:
-                corpus.add_resource( user=request.user, type=resource_type, file=filename )
+        #         corpus.add_resource( user=request.user, type=resource_type, file=filename )
-                dwnldsOK+=1
+        #         dwnldsOK+=1
-        if dwnldsOK == 0: return JsonHttpResponse(["fail"])
+        # if dwnldsOK == 0: return JsonHttpResponse(["fail"])
-        # do the WorkFlow
+        # # do the WorkFlow
-        try:
+        # try:
-            if DEBUG is True:
+        #     if DEBUG is True:
-                # corpus.workflow() # old times...
+        #         # corpus.workflow() # old times...
-                corpus.workflow__MOV()
+        #         corpus.workflow__MOV()
-                # corpus.write_everything_to_DB()
+        #         # corpus.write_everything_to_DB()
-            else:
+        #     else:
-                # corpus.workflow.apply_async((), countdown=3)
+        #         # corpus.workflow.apply_async((), countdown=3)
-                corpus.workflow__MOV().apply_async((), countdown=3) # synchronous! because is faaast
+        #         corpus.workflow__MOV().apply_async((), countdown=3) # synchronous! because is faaast
-                # corpus.write_everything_to_DB.apply_async((), countdown=3) # asynchronous
+        #         # corpus.write_everything_to_DB.apply_async((), countdown=3) # asynchronous
-            return JsonHttpResponse(["workflow","finished"])
+        #     return JsonHttpResponse(["workflow","finished"])
-        except Exception as error:
+        # except Exception as error:
-            print(error)
+        #     print(error)
-        return JsonHttpResponse(["workflow","finished","outside the try-except"])
+        return JsonHttpResponse(["out of service for the moment"])
     data = alist
     return JsonHttpResponse(data)
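
For reference, a minimal standalone sketch (not part of this commit) of the thread-and-queue download pattern the commented-out block above relies on; fetch_to_file and the output paths are hypothetical stand-ins for MedlineFetcher.worker2 and the per-corpus filenames.

import queue
import threading
from urllib.request import urlopen

q = queue.Queue()

def fetch_to_file():
    while True:
        url, filename = q.get()
        try:
            with urlopen(url) as response, open(filename, "wb") as outfile:
                outfile.write(response.read())
        except Exception as error:
            print(error)
        finally:
            q.task_done()  # lets q.join() return once every queued task is done

for i in range(8):                       # same worker count as in the view
    t = threading.Thread(target=fetch_to_file)
    t.daemon = True                      # thread dies when the main thread exits
    t.start()

for i, url in enumerate(["http://localhost/374255", "http://localhost/374278"]):
    q.put([url, "/tmp/download_%s.xml" % i])

q.join()                                 # wait until everything is finished
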

@@ -164,59 +164,59 @@ def testISTEX(request , project_id):
     print(query_string , query , N)
-    urlreqs = []
+    # urlreqs = []
-    pagesize = 50
+    # pagesize = 50
-    tasks = MedlineFetcher()
+    # tasks = MedlineFetcher()
-    chunks = list(tasks.chunks(range(N), pagesize))
+    # chunks = list(tasks.chunks(range(N), pagesize))
-    for k in chunks:
+    # for k in chunks:
-        if (k[0]+pagesize)>N: pagesize = N-k[0]
+    #     if (k[0]+pagesize)>N: pagesize = N-k[0]
-        urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
+    #     urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
-    print(urlreqs)
+    # print(urlreqs)
-    urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
+    # urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
-    print(urlreqs)
+    # print(urlreqs)
-    resource_type = ResourceType.objects.get(name="istext" )
+    # resource_type = ResourceType.objects.get(name="istext" )
-    parent = Node.objects.get(id=project_id)
+    # parent = Node.objects.get(id=project_id)
-    node_type = NodeType.objects.get(name='Corpus')
+    # node_type = NodeType.objects.get(name='Corpus')
-    type_id = NodeType.objects.get(name='Document').id
+    # type_id = NodeType.objects.get(name='Document').id
-    user_id = User.objects.get( username=request.user ).id
+    # user_id = User.objects.get( username=request.user ).id
-    corpus = Node(
+    # corpus = Node(
-        user=request.user,
+    #     user=request.user,
-        parent=parent,
+    #     parent=parent,
-        type=node_type,
+    #     type=node_type,
-        name=query,
+    #     name=query,
-    )
+    # )
-    corpus.save()
+    # corpus.save()
-    # configuring your queue with the event
+    # # configuring your queue with the event
-    for i in range(8):
+    # for i in range(8):
-        t = threading.Thread(target=tasks.worker2) #thing to do
+    #     t = threading.Thread(target=tasks.worker2) #thing to do
-        t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
+    #     t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
-        t.start()
+    #     t.start()
-    for url in urlreqs:
+    # for url in urlreqs:
-        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
+    #     filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
-        tasks.q.put( [url , filename]) #put a task in th queue
+    #     tasks.q.put( [url , filename]) #put a task in th queue
-    tasks.q.join() # wait until everything is finished
+    # tasks.q.join() # wait until everything is finished
-    for filename in tasks.firstResults:
+    # for filename in tasks.firstResults:
-        corpus.add_resource( user=request.user, type=resource_type, file=filename )
+    #     corpus.add_resource( user=request.user, type=resource_type, file=filename )
-    corpus.save()
+    # corpus.save()
-    print("DEBUG:",DEBUG)
+    # print("DEBUG:",DEBUG)
-    # do the WorkFlow
+    # # do the WorkFlow
-    try:
+    # try:
-        if DEBUG is True:
+    #     if DEBUG is True:
-            corpus.workflow()
+    #         corpus.workflow()
-        else:
+    #     else:
-            corpus.workflow.apply_async((), countdown=3)
+    #         corpus.workflow.apply_async((), countdown=3)
-        return JsonHttpResponse(["workflow","finished"])
+    #     return JsonHttpResponse(["workflow","finished"])
-    except Exception as error:
+    # except Exception as error:
-        print(error)
+    #     print(error)
     data = [query_string , query , N]
     return JsonHttpResponse(data)
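
For reference, a minimal standalone sketch (not part of this commit) of how the commented-out block above pages through the ISTEX API, 50 documents at a time; istex_page_urls and the sample query are illustrative assumptions.

def istex_page_urls(query_string, n_results, pagesize=50):
    urls = []
    for start in range(0, n_results, pagesize):
        size = min(pagesize, n_results - start)   # shrink the last page, as the view does
        urls.append("http://api.istex.fr/document/?q=" + query_string
                    + "&output=*&" + "from=" + str(start) + "&size=" + str(size))
    return urls

# e.g. 120 results -> pages starting at 0, 50 and 100, with sizes 50, 50 and 20
for url in istex_page_urls("graphene", 120):
    print(url)
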