humanities / gargantext · Commits

Commit d0003ef9, authored Feb 03, 2015 by PkSM3
[FEAT] pubmed scrapper
Parent: ed1311f3
Showing 10 changed files with 466 additions and 118 deletions (+466, -118)
analysis/functions.py                      +0    -1
gargantext_web/urls.py                     +2    -1
gargantext_web/views.py                    +19   -4
node/admin.py                              +2    -5
node/models.py                             +24   -6
parsing/FileParsers/PubmedFileParser.py    +21   -7
parsing/FileParsers/__init__.py            +1    -0
scrap_pubmed/MedlineFetcherDavid2015.py    +74   -51
scrap_pubmed/views.py                      +94   -13
templates/project.html                     +229  -30
analysis/functions.py
View file @ d0003ef9
...
...
@@ -274,7 +274,6 @@ def do_tfidf(corpus, reset=True):
NodeNodeNgram.objects.filter(nodex=corpus).delete()

if isinstance(corpus, Node) and corpus.type.name == "Corpus":
    print(Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")))
    for document in Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")):
        for node_ngram in Node_Ngram.objects.filter(node=document):
            try:
...
...
gargantext_web/urls.py
View file @ d0003ef9
...
...
@@ -69,7 +69,8 @@ urlpatterns = patterns('',
url(r'^tests/mvc-listdocuments$', views.tests_mvc_listdocuments),
url(r'^tests/pubmedquery$', pubmedscrapper.getGlobalStats),
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery)
url(r'^tests/project/(\d+)/pubmedquery/go$', pubmedscrapper.doTheQuery),
url(r'^tests/project/(\d+)/ISTEXquery/go$', pubmedscrapper.testISTEX)
)
...
...
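For reference, a minimal sketch of how the routes touched here could be exercised with Django's test client; the payload keys mirror what scrap_pubmed.views reads from request.POST, while the project id and the login step are assumptions, not part of the commit.

# Hedged sketch: exercising the new pubmed/ISTEX routes with Django's test client.
from django.test import Client

client = Client()
# client.login(username="...", password="...")  # if the views require authentication

# Ask for global PubMed statistics for a query string.
stats = client.post("/tests/pubmedquery", {"query": "microbiota"})
print(stats.status_code, stats.content[:200])

# Launch the actual PubMed harvest for a hypothetical project with id 1.
run = client.post("/tests/project/1/pubmedquery/go", {"query": "microbiota"})
print(run.status_code)

# Same pattern for the new ISTEX endpoint.
istex = client.post("/tests/project/1/ISTEXquery/go",
                    {"query": "microbiota", "string": "microbiota", "N": 60})
print(istex.status_code)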
gargantext_web/views.py
View file @ d0003ef9
...
...
@@ -223,8 +223,14 @@ def project(request, project_id):
corpus_view['count'] = corpus.children.count()

#just get first element of the corpora and get his type.
resource_corpus = Node_Resource.objects.filter(node=corpus)
if len(resource_corpus) > 0:
    # print(Node_Resource.objects.filter(node=corpus).all())
    corpus_type = Node_Resource.objects.filter(node=corpus)[0].resource.type
    list_corpora[corpus_type].append(corpus_view)
    donut_part[corpus_type] += docs_count
else:
    print(" Node_Resource = this.corpus(", corpus.pk, ") ... nothing, why?")

## For avoiding to list repeated elements, like when u use the dynamic query (per each xml, 1)
# for node_resource in Node_Resource.objects.filter(node=corpus):
...
...
@@ -237,6 +243,8 @@ def project(request, project_id):
if docs_total == 0 or docs_total is None:
    docs_total = 1
# The donut will show: percentage by
donut = [{'source': key,
          'count': donut_part[key],
          'part': round(donut_part[key] * 100 / docs_total)} \
...
...
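As a quick worked example of the 'part' computation above, with hypothetical counts:

# Hypothetical worked example of the donut percentages computed above.
donut_part = {'pubmed': 30, 'europress_french': 10}
docs_total = sum(donut_part.values()) or 1   # the view falls back to 1 when the total is 0 or None

donut = [{'source': key,
          'count': donut_part[key],
          'part': round(donut_part[key] * 100 / docs_total)}
         for key in donut_part]
print(donut)
# [{'source': 'pubmed', 'count': 30, 'part': 75}, {'source': 'europress_french', 'count': 10, 'part': 25}]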
@@ -246,12 +254,15 @@ def project(request, project_id):
if request.method == 'POST':
    form = CustomForm(request.POST, request.FILES)
    if form.is_valid():
        name = form.cleaned_data['name']
        thefile = form.cleaned_data['file']
        resource_type = ResourceType.objects.get(id=str(form.cleaned_data['type']))
        print(request.POST['type'])
        print(form.cleaned_data['type'])
        resource_type = ResourceType.objects.get(name=str(form.cleaned_data['type']))
        print("-------------")
        print(name, "|", resource_type, "|", thefile)
...
...
@@ -327,6 +338,7 @@ def project(request, project_id):
else:
    form = CustomForm()

return render(request, 'project.html', {
    'form': form,
    'user': user,
...
...
@@ -748,9 +760,12 @@ def node_link(request, corpus_id):
'''
Create the HttpResponse object with the node_link dataset.
'''
import time

print("In node_link() START")
start = time.time()
data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
end = time.time()
print("LOG::TIME: get_cooc() [s]", (end - start))
print("In node_link() END")
return JsonHttpResponse(data)
...
...
node/admin.py
View file @ d0003ef9
...
...
@@ -98,13 +98,10 @@ from django import forms
from django.utils.translation import ugettext_lazy as _

class CustomForm(forms.Form):
    name = forms.CharField(label='Name', max_length=199, required=True)
    parsing_options = ResourceType.objects.all().values_list('id', 'name')
    type = forms.IntegerField(widget=forms.Select(choices=parsing_options), required=True)
    name = forms.CharField(label='Name', max_length=199, widget=forms.TextInput(attrs={'required': 'true'}))
    type = ModelChoiceField(ResourceType.objects.all(), widget=forms.Select(attrs={'onchange': 'CustomForSelect( $("option:selected", this).text() );'}))
    file = forms.FileField()

    # Description: clean_file()
    """
    * file_.content_type - Example: ['application/pdf', 'image/jpeg']
...
...
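A practical consequence of switching type from an IntegerField over (id, name) choices to a ModelChoiceField: after validation, cleaned_data['type'] is a ResourceType instance rather than an integer, which is presumably why gargantext_web/views.py above now resolves the resource type by name. A minimal sketch, assuming a configured Django project where these classes are importable; import paths and sample values are illustrative, not from the commit.

# Illustrative sketch only; assumes Django settings are configured and the DB is populated.
from django.core.files.uploadedfile import SimpleUploadedFile
from node.admin import CustomForm            # the form defined above
from node.models import ResourceType         # assumed import path for ResourceType

rt = ResourceType.objects.first()
form = CustomForm(
    data={'name': 'My corpus', 'type': rt.pk},
    files={'file': SimpleUploadedFile('corpus.xml', b'<xml/>')},
)
if form.is_valid():
    chosen = form.cleaned_data['type']       # a ResourceType instance, not an int
    print(str(chosen))                       # what the view passes to ResourceType.objects.get(name=...)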
node/models.py
View file @ d0003ef9
...
...
@@ -163,6 +163,7 @@ class Node(CTENode):
for node_resource in self.node_resource.filter(parsed=False):
    resource = node_resource.resource
    parser = defaultdict(lambda: FileParser.FileParser, {
        'istext'    : ISText,
        'pubmed'    : PubmedFileParser,
        'isi'       : IsiFileParser,
        'ris'       : RisFileParser,
...
...
@@ -171,6 +172,7 @@ class Node(CTENode):
        'europress_english' : EuropressFileParser,
    })[resource.type.name]()
    metadata_list += parser.parse(str(resource.file))
    # print(parser.parse(str(resource.file)))

# retrieve info from the database
type_id = NodeType.objects.get(name='Document').id
langages_cache = LanguagesCache()
...
...
@@ -183,6 +185,8 @@ class Node(CTENode):
language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
if isinstance(language, tuple):
    language = language[0]
# print("metadata_values:")
# print("\t",metadata_values,"\n- - - - - - - - - - - - ")
Node(
    user_id = user_id,
    type_id = type_id,
...
...
@@ -191,7 +195,6 @@ class Node(CTENode):
    language_id = language.id if language else None,
    metadata = metadata_values
).save()

# make metadata filterable
self.children.all().make_metadata_filterable()
...
...
@@ -236,17 +239,32 @@ class Node(CTENode):
@current_app.task(filter=task_method)
def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
    print("In workflow() parse_resources()")
    import time
    print("LOG::TIME: In workflow() parse_resources()")
    start = time.time()
    self.parse_resources()
    print("In workflow() / parse_resources()")
    print("In workflow() extract_ngrams()")
    end = time.time()
    print("LOG::TIME: parse_resources() [s]", (end - start))
    print("LOG::TIME: In workflow() / parse_resources()")
    start = time.time()
    print("LOG::TIME: In workflow() extract_ngrams()")
    type_document = NodeType.objects.get(name='Document')
    self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
    print("In workflow() / extract_ngrams()")
    end = time.time()
    print("LOG::TIME: ", (end - start))
    print("LOG::TIME: extract_ngrams() [s]", (end - start))
    print("LOG::TIME: In workflow() / extract_ngrams()")
    start = time.time()
    print("In workflow() do_tfidf()")
    from analysis.functions import do_tfidf
    do_tfidf(self)
    print("In workflow() / do_tfidf()")
    end = time.time()
    print("LOG::TIME: do_tfidf() [s]", (end - start))
    print("LOG::TIME: In workflow() / do_tfidf()")
    print("In workflow() END")

class Node_Metadata(models.Model):
...
...
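The workflow() method above times each stage by hand with repeated start = time.time() / print("LOG::TIME: ...") pairs. A small context manager captures the same pattern; this is only an illustrative sketch, not part of the commit.

# Illustrative only: a tiny context manager reproducing the LOG::TIME pattern used in workflow().
import time
from contextlib import contextmanager

@contextmanager
def log_time(label):
    start = time.time()
    print("In %s START" % label)
    yield
    print("LOG::TIME: %s [s]" % label, time.time() - start)
    print("In %s END" % label)

# Usage (inside workflow(), hypothetically):
# with log_time("parse_resources()"):
#     self.parse_resources()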
parsing/FileParsers/PubmedFileParser.py
View file @ d0003ef9
...
...
@@ -25,6 +25,7 @@ class PubmedFileParser(FileParser):
metadata_path = {
    "journal"        : 'MedlineCitation/Article/Journal/Title',
    "title"          : 'MedlineCitation/Article/ArticleTitle',
    "title2"         : 'MedlineCitation/Article/VernacularTitle',
    "language_iso3"  : 'MedlineCitation/Article/Language',
    "doi"            : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
    "realdate_full_" : 'MedlineCitation/Article/Journal/JournalIssue/PubDate/MedlineDate',
...
...
@@ -51,6 +52,13 @@ class PubmedFileParser(FileParser):
except:
    pass

#Title-Decision
Title = ""
if not metadata["title"] or metadata["title"] == "":
    if "title2" in metadata:
        metadata["title"] = metadata["title2"]
    else:
        metadata["title"] = ""

# Date-Decision
# forge.iscpif.fr/issues/1418
RealDate = ""
...
...
@@ -68,19 +76,25 @@ class PubmedFileParser(FileParser):
if "publication_month" in metadata: PubmedDate += " " + metadata["publication_month"]
if "publication_day"   in metadata: PubmedDate += " " + metadata["publication_day"]

Decision = ""
if len(RealDate) > 4:
    if len(RealDate) > 8:
        decision = datetime.strptime(RealDate, '%Y %b %d').date()
    else:
        decision = datetime.strptime(RealDate, '%Y %b').date()
else:
    decision = datetime.strptime(PubmedDate, '%Y %m %d').date()

    if len(RealDate) > 8:
        try:    Decision = datetime.strptime(RealDate, '%Y %b %d').date()
        except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
    else:
        try:    Decision = datetime.strptime(RealDate, '%Y %b').date()
        except: Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()
else:
    Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()

if "publication_year"  in metadata: metadata["publication_year"]  = str(decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(decision.month)
if "publication_day"   in metadata: metadata["publication_day"]   = str(decision.day)
if "publication_year"  in metadata: metadata["publication_year"]  = str(Decision.year)
if "publication_month" in metadata: metadata["publication_month"] = str(Decision.month)
if "publication_day"   in metadata: metadata["publication_day"]   = str(Decision.day)
if "realdate_year_"  in metadata: metadata.pop("realdate_year_")
if "realdate_month_" in metadata: metadata.pop("realdate_month_")
if "realdate_day_"   in metadata: metadata.pop("realdate_day_")
if "title2" in metadata: metadata.pop("title2")
# print(metadata)
metadata_list.append(metadata)

# return the list of metadata
return metadata_list
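The date decision above tries the free-text MedlineDate (RealDate) with '%Y %b %d' or '%Y %b' and falls back to the numeric PubDate fields with '%Y %m %d'. A self-contained illustration of those two formats, with made-up values:

# Standalone illustration of the two date formats handled above (illustrative values).
from datetime import datetime

RealDate   = "2014 Jan 15"          # e.g. from .../PubDate/MedlineDate
PubmedDate = "2014 01 15"           # rebuilt from publication_year/month/day

if len(RealDate) > 8:
    try:
        Decision = datetime.strptime(RealDate, '%Y %b %d').date()    # year + abbreviated month + day
    except ValueError:
        Decision = datetime.strptime(PubmedDate, '%Y %m %d').date()  # numeric fallback
else:
    Decision = datetime.strptime(RealDate, '%Y %b').date()           # year + month only

print(Decision)   # 2014-01-15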
parsing/FileParsers/__init__.py
View file @ d0003ef9
...
...
@@ -2,3 +2,4 @@ from parsing.FileParsers.RisFileParser import RisFileParser
from parsing.FileParsers.IsiFileParser import IsiFileParser
from parsing.FileParsers.PubmedFileParser import PubmedFileParser
from parsing.FileParsers.EuropressFileParser import EuropressFileParser
from parsing.FileParsers.ISText import ISText
scrap_pubmed/MedlineFetcherDavid2015.py
View file @ d0003ef9
...
...
@@ -10,29 +10,36 @@ import os
import time
# import libxml2
from lxml import etree
from datetime import datetime
from django.core.files import File
import threading
from queue import Queue
import time

class MedlineFetcher:

    def __init__(self):
        self.queue_size = 8
        self.q = Queue()
        self.firstResults = []
        self.lock = threading.Lock()  # lock to serialize console output
        self.pubMedEutilsURL = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
        self.pubMedDB = 'Pubmed'
        self.reportType = 'medline'
        self.personalpath_mainPath = 'MedLine/'
        if not os.path.isdir(self.personalpath_mainPath):
            os.makedirs(self.personalpath_mainPath)
            print('Created directory ' + self.personalpath_mainPath)

    # Return the:
    # Return the globalResults!:
    # - count =
    # - queryKey =
    # - webEnv =
    def medlineEsearch(self, query):
        print("MedlineFetcher::medlineEsearch :")
        # print ("MedlineFetcher::medlineEsearch :")
        "Get number of results for query 'query' in variable 'count'"
        "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
        origQuery = query
        query = query.replace(' ', '%20')
        eSearch = '%s/esearch.fcgi?db=%s&retmax=1&usehistory=y&term=%s' % (self.pubMedEutilsURL, self.pubMedDB, query)
...
...
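For context, the esearch URL built above returns an XML document whose Count, QueryKey and WebEnv values are what medlineEsearch extracts. A minimal standalone sketch of that request/parse cycle, with no Django and no threading; illustrative only, error handling omitted:

# Minimal standalone sketch of the esearch step above; values are illustrative.
from urllib.request import urlopen
from lxml import etree

eutils = 'http://www.ncbi.nlm.nih.gov/entrez/eutils'
query = 'microbiota'.replace(' ', '%20')
eSearch = '%s/esearch.fcgi?db=Pubmed&retmax=1&usehistory=y&term=%s' % (eutils, query)

root = etree.fromstring(urlopen(eSearch).read())
count    = root.findtext('Count')     # total hits for the query
queryKey = root.findtext('QueryKey')  # handle for a later efetch call
webEnv   = root.findtext('WebEnv')    # server-side history token
print({"count": int(count), "queryKey": queryKey, "webEnv": webEnv})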
@@ -50,13 +57,7 @@ class MedlineFetcher:
findwebenv = etree.XPath("/eSearchResult/WebEnv/text()")
webEnv = findwebenv(root)[0]

# doc = libxml2.parseDoc(data)
# count = doc.xpathEval('eSearchResult/Count/text()')[0]
# queryKey = doc.xpathEval('eSearchResult/QueryKey/text()')[0]
# webEnv = doc.xpathEval('eSearchResult/WebEnv/text()')[0]
# print count, queryKey, webEnv

values = {"count": int(str(count)), "queryKey": queryKey, "webEnv": webEnv}
print(values)
values = {"query": origQuery, "count": int(str(count)), "queryKey": queryKey, "webEnv": webEnv}
return values
...
...
@@ -72,40 +73,58 @@ class MedlineFetcher:
queryKey = fullquery["queryKey"]
webEnv   = fullquery["webEnv"]

print("MedlineFetcher::medlineEfetchRAW :")
"Fetch medline result for query 'query', saving results to file every 'retmax' articles"

queryNoSpace = query.replace(' ', '')  # No space in directory and file names, avoids stupid errors

# pubmedqueryfolder = personalpath.pubMedAbstractsPath + 'Pubmed_' + queryNoSpace
# if not os.path.isdir(pubmedqueryfolder):
#     os.makedirs(pubmedqueryfolder)

pubMedResultFileName = self.personalpath_mainPath + 'Pubmed_' + queryNoSpace + '.xml'
pubMedResultFile = open(pubMedResultFileName, 'w')

print('Query "', query, '"\t:\t', count, ' results')
print('Starting fetching at ', time.asctime(time.localtime()))
print("LOG::TIME: ", 'medlineEfetchRAW :Query "', query, '"\t:\t', count, ' results')

retstart = 0
# while(retstart < count):
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' % (self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
return eFetch

# if sys.version_info >= (3, 0): pubMedResultFile.write(eFetchResult.read().decode('utf-8'))
# else: pubMedResultFile.write(eFetchResult.read())
# retstart += retmax
# break # you shall not pass !!
# pubMedResultFile.close()

# print ('Fetching for query ' , query , ' finished at ' , time.asctime(time.localtime()) )
# print (retmax , ' results written to file ' , pubMedResultFileName , '\n' )
# print("------------------------------------------")
# return ["everything","ok"]

# generic!
def downloadFile(self, item):
    url      = item[0]
    filename = item[1]
    print("\t in downloadFile:")
    print(url, filename)
    data = urlopen(url)
    f = open(filename, 'w')
    myfile = File(f)
    myfile.write(data.read().decode('utf-8'))
    myfile.close()
    f.close()
    with self.lock:
        print(threading.current_thread().name, filename + " OK")
    return filename

# generic!
def do_work(self, item):
    # time.sleep(1) # pretend to do some lengthy work.
    returnvalue = self.medlineEsearch(item)
    with self.lock:
        print(threading.current_thread().name, item)
    return returnvalue

# The worker thread pulls an item from the queue and processes it
def worker(self):
    while True:
        item = self.q.get()
        self.firstResults.append(self.do_work(item))
        self.q.task_done()

def worker2(self):
    while True:
        item = self.q.get()
        self.firstResults.append(self.downloadFile(item))
        self.q.task_done()

def chunks(self, l, n):
    print("chunks:")
    for i in range(0, len(l), n):
        yield l[i:i + n]

# GLOBALLIMIT:
# I will retrieve this exact amount of publications.
...
...
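worker and worker2 above follow the standard queue-plus-daemon-threads pattern: tasks are put on self.q, each thread pulls one, appends its result to firstResults, and marks it done so a later q.join() can block until the queue drains. A stripped-down, self-contained version of the same pattern, shown only as an illustration and not part of the commit:

# Self-contained illustration of the queue + daemon-thread pattern used above.
import threading
from queue import Queue

q = Queue()
results = []
lock = threading.Lock()                 # serialize access to the shared results list

def worker():
    while True:
        item = q.get()
        with lock:
            results.append(item * 2)    # stand-in for medlineEsearch() / downloadFile()
        q.task_done()

for _ in range(8):                      # same pool size as queue_size above
    t = threading.Thread(target=worker)
    t.daemon = True                     # dies when the main thread exits
    t.start()

for item in range(20):
    q.put(item)                         # enqueue the tasks
q.join()                                # block until every task is processed
print(sorted(results))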
@@ -115,22 +134,34 @@ class MedlineFetcher:
# - GlobalLimit : Number of publications i want.
def serialFetcher(self, yearsNumber, query, globalLimit):
    # Create the queue and thread pool.
    for i in range(self.queue_size):
        t = threading.Thread(target=self.worker)
        t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
        t.start()
    start = time.perf_counter()

    N = 0
    print("MedlineFetcher::serialFetcher :")
    thequeries    = []
    globalresults = []
    for i in range(yearsNumber):
        year = str(2015 - i)
        print('YEAR ' + year)
        print('---------\n')
        # medlineEfetch(str(year) + '[dp] '+query , 20000)
        # medlineEfetchRAW(str(year) + '[dp] '+query , retmax=300)
        pubmedquery = str(year) + '[dp] ' + query
        globalresults = self.medlineEsearch(pubmedquery)
        self.q.put(pubmedquery)  #put task in the queue

    self.q.join()
    print('time:', time.perf_counter() - start)

    for globalresults in self.firstResults:
        # globalresults = self.medlineEsearch(pubmedquery)
        if globalresults["count"] > 0:
            N += globalresults["count"]
            querymetadata = {
                "string"   : pubmedquery,
                "string"   : globalresults["query"],
                "count"    : globalresults["count"],
                "queryKey" : globalresults["queryKey"],
                "webEnv"   : globalresults["webEnv"],
...
@@ -149,11 +180,3 @@ class MedlineFetcher:
            query["retmax"] = retmax_forthisyear

    return thequeries

# serialFetcher(yearsNumber=3, 'microbiota' , globalLimit=100 )
# query = str(2015)+ '[dp] '+'microbiota'
# medlineEsearch( query )
#
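Judging from the querymetadata dict assembled above and the final query["retmax"] assignment, each entry of the list returned by serialFetcher looks roughly like the following; this is a hedged sketch with made-up values, not output produced by the code:

# Hypothetical shape of one entry returned by serialFetcher(5, 'microbiota', 300);
# the keys follow the querymetadata dict above, the values are invented.
example_entry = {
    "string"   : "2015[dp] microbiota",   # the per-year PubMed query
    "count"    : 1234,                    # hits reported by esearch for that year
    "queryKey" : "1",
    "webEnv"   : "NCID_1_...",
    "retmax"   : 74,                      # that year's share of the global limit
}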
scrap_pubmed/views.py
View file @ d0003ef9
...
...
@@ -14,6 +14,8 @@ import json
from gargantext_web.settings import MEDIA_ROOT
from datetime import datetime
import time
import threading
from django.core.files import File
from gargantext_web.settings import DEBUG
...
...
@@ -28,14 +30,16 @@ def getGlobalStats(request ):
if request.method == "POST":
    query = request.POST["query"]
    print("LOG::TIME: query =", query)
    print("LOG::TIME: N =", 300)
    instancia = MedlineFetcher()
    alist = instancia.serialFetcher(5, query, 100)
    # alist = instancia.serialFetcher( 5, query , int(request.POST["N"]) )
    alist = instancia.serialFetcher(5, query, 300)

data = alist
return JsonHttpResponse(data)

from parsing.FileParsers import PubmedFileParser
def doTheQuery(request, project_id):
    alist = ["hola", "mundo"]
...
...
@@ -78,17 +82,20 @@ def doTheQuery(request , project_id):
corpus.save()

try:
    tasks = MedlineFetcher()
    # configuring your queue with the event
    for i in range(8):
        t = threading.Thread(target=tasks.worker2)  #thing to do
        t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
        t.start()
    for url in urlreqs:
        print(url)
        data = urlopen(url)
        xmlname = MEDIA_ROOT + '/corpora/%s/%s.xml' % (request.user, str(datetime.now().microsecond))
        f = open(xmlname, 'w')
        myfile = File(f)
        myfile.write(data.read().decode('utf-8'))
        myfile.close()
        f.close()
        corpus.add_resource(user=request.user, type=resource_type, file=xmlname)
        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
        tasks.q.put([url, filename])  #put a task in th queue
    tasks.q.join()  # wait until everything is finished

    for filename in tasks.firstResults:
        corpus.add_resource(user=request.user, type=resource_type, file=filename)

    # do the WorkFlow
    try:
        if DEBUG is True:
            corpus.workflow()
...
...
@@ -96,7 +103,6 @@ def doTheQuery(request , project_id):
            corpus.workflow.apply_async((), countdown=3)
        return JsonHttpResponse(["workflow", "finished"])
    except Exception as error:
        print(error)
...
...
@@ -107,3 +113,78 @@ def doTheQuery(request , project_id):
    data = alist
    return JsonHttpResponse(data)


def testISTEX(request, project_id):
    print(request.method)
    alist = ["bar", "foo"]

    if request.method == "POST":
        # print(alist)
        query = "-"
        query_string = "-"
        N = 60
        if "query" in request.POST:
            query = request.POST["query"]
        if "string" in request.POST:
            query_string = request.POST["string"].replace(" ", "+")
        # if "N" in request.POST: N = request.POST["N"]
        print(query_string, query, N)

        urlreqs = []
        pagesize = 50
        tasks = MedlineFetcher()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string + "&output=*&" + "from=" + str(k[0]) + "&size=" + str(pagesize))
        print(urlreqs)
        # urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]

        resource_type = ResourceType.objects.get(name="istext")

        parent    = Node.objects.get(id=project_id)
        node_type = NodeType.objects.get(name='Corpus')
        type_id   = NodeType.objects.get(name='Document').id
        user_id   = User.objects.get(username=request.user).id

        corpus = Node(
            user   = request.user,
            parent = parent,
            type   = node_type,
            name   = query,
        )
        corpus.save()

        # configuring your queue with the event
        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  #thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
            tasks.q.put([url, filename])  #put a task in th queue
        tasks.q.join()  # wait until everything is finished

        for filename in tasks.firstResults:
            corpus.add_resource(user=request.user, type=resource_type, file=filename)
        corpus.save()

        # do the WorkFlow
        try:
            if DEBUG is True:
                corpus.workflow()
            else:
                corpus.workflow.apply_async((), countdown=3)
            return JsonHttpResponse(["workflow", "finished"])
        except Exception as error:
            print(error)

    data = [query_string, query, N]
    return JsonHttpResponse(data)
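The paging loop in testISTEX walks the first N ISTEX hits in pages of at most pagesize documents, shrinking the final page so from + size never overshoots N. The same loop in isolation, with a placeholder query_string:

# Standalone illustration of the ISTEX paging arithmetic above (query_string is a placeholder).
N = 60
pagesize = 50
query_string = "microbiota"

def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

urlreqs = []
for k in chunks(range(N), pagesize):
    if (k[0] + pagesize) > N:
        pagesize = N - k[0]           # last page only covers the remainder
    urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                   + "&output=*&" + "from=" + str(k[0]) + "&size=" + str(pagesize))
print(urlreqs)
# ['...from=0&size=50', '...from=50&size=10']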
templates/project.html
View file @ d0003ef9
This diff is collapsed.