humanities/gargantext

Commit 2b36c5d0 authored Mar 20, 2015 by PkSM3
[UPDATE] pubmed|istex removal (for the moment)

parent fc089d47
Showing 2 changed files with 117 additions and 456 deletions (+117, -456)
node/models.py: +0, -339
scrap_pubmed/views.py: +117, -117
node/models.py @ 2b36c5d0
This diff is collapsed.
scrap_pubmed/views.py @ 2b36c5d0
...
...
@@ -73,75 +73,75 @@ def doTheQuery(request , project_id):
     alist = ["hola" , "mundo"]
     if request.method == "POST":
-        query = request.POST["query"]
-        name = request.POST["string"]
-        instancia = MedlineFetcher()
-        thequeries = json.loads(query)
-        urlreqs = []
-        for yearquery in thequeries:
-            urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
-        alist = ["tudo fixe" , "tudo bem"]
-        """
-        urlreqs: List of urls to query.
-        - Then, to each url in urlreqs you do:
-            eFetchResult = urlopen(url)
-            eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
-        """
-        thefile = "how we do this here?"
-        resource_type = ResourceType.objects.get(name="pubmed" )
-        parent = Node.objects.get(id=project_id)
-        node_type = NodeType.objects.get(name='Corpus')
-        type_id = NodeType.objects.get(name='Document').id
-        user_id = User.objects.get( username=request.user ).id
-        corpus = Node(
-            user=request.user,
-            parent=parent,
-            type=node_type,
-            name=name,
-        )
-        corpus.save()
-        tasks = MedlineFetcher()
-        for i in range(8):
-            t = threading.Thread(target=tasks.worker2) #thing to do
-            t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
-            t.start()
-        for url in urlreqs:
-            filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
-            tasks.q.put( [url , filename]) #put a task in th queue
-        tasks.q.join() # wait until everything is finished
-        dwnldsOK = 0
-        for filename in tasks.firstResults:
-            if filename != False:
-                corpus.add_resource( user=request.user, type=resource_type, file=filename )
-                dwnldsOK += 1
+        # query = request.POST["query"]
+        # name = request.POST["string"]
+        # instancia = MedlineFetcher()
+        # thequeries = json.loads(query)
+        # urlreqs = []
+        # for yearquery in thequeries:
+        #     urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
+        # alist = ["tudo fixe" , "tudo bem"]
+        # """
+        # urlreqs: List of urls to query.
+        # - Then, to each url in urlreqs you do:
+        #     eFetchResult = urlopen(url)
+        #     eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
+        # """
+        # thefile = "how we do this here?"
+        # resource_type = ResourceType.objects.get(name="pubmed" )
+        # parent = Node.objects.get(id=project_id)
+        # node_type = NodeType.objects.get(name='Corpus')
+        # type_id = NodeType.objects.get(name='Document').id
+        # user_id = User.objects.get( username=request.user ).id
+        # corpus = Node(
+        #     user=request.user,
+        #     parent=parent,
+        #     type=node_type,
+        #     name=name,
+        # )
+        # corpus.save()
+        # tasks = MedlineFetcher()
+        # for i in range(8):
+        #     t = threading.Thread(target=tasks.worker2) #thing to do
+        #     t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
+        #     t.start()
+        # for url in urlreqs:
+        #     filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
+        #     tasks.q.put( [url , filename]) #put a task in th queue
+        # tasks.q.join() # wait until everything is finished
+        # dwnldsOK = 0
+        # for filename in tasks.firstResults:
+        #     if filename != False:
+        #         corpus.add_resource( user=request.user, type=resource_type, file=filename )
+        #         dwnldsOK += 1
-        if dwnldsOK == 0: return JsonHttpResponse(["fail"])
+        # if dwnldsOK == 0: return JsonHttpResponse(["fail"])
-        # do the WorkFlow
-        try:
-            if DEBUG is True:
-                # corpus.workflow() # old times...
-                corpus.workflow__MOV()
-                # corpus.write_everything_to_DB()
-            else:
-                # corpus.workflow.apply_async((), countdown=3)
-                corpus.workflow__MOV().apply_async((), countdown=3) # synchronous! because is faaast
-                # corpus.write_everything_to_DB.apply_async((), countdown=3) # asynchronous
+        # # do the WorkFlow
+        # try:
+        #     if DEBUG is True:
+        #         # corpus.workflow() # old times...
+        #         corpus.workflow__MOV()
+        #         # corpus.write_everything_to_DB()
+        #     else:
+        #         # corpus.workflow.apply_async((), countdown=3)
+        #         corpus.workflow__MOV().apply_async((), countdown=3) # synchronous! because is faaast
+        #         # corpus.write_everything_to_DB.apply_async((), countdown=3) # asynchronous
-            return JsonHttpResponse(["workflow","finished"])
-        except Exception as error:
-            print(error)
+        #     return JsonHttpResponse(["workflow","finished"])
+        # except Exception as error:
+        #     print(error)
-        return JsonHttpResponse(["workflow","finished","outside the try-except"])
+        return JsonHttpResponse(["out of service for the moment"])
     data = alist
     return JsonHttpResponse(data)
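For context, the body commented out above follows a standard producer-consumer download pattern: eight daemon threads drain a queue of [url, filename] tasks, and q.join() blocks until every PubMed download has landed. MedlineFetcher's internals are not part of this diff, so the sketch below is a minimal, hypothetical reconstruction: the Fetcher class and its worker2, q, and firstResults attributes are inferred from the call sites above, not taken from the project's actual implementation.

    # Minimal sketch of the worker-queue pattern used by the removed code.
    # Fetcher is a hypothetical stand-in for MedlineFetcher (assumption).
    import queue
    import threading
    from urllib.request import urlopen

    class Fetcher:
        def __init__(self):
            self.q = queue.Queue()
            self.firstResults = []
            self._lock = threading.Lock()

        def worker2(self):
            # Each worker loops forever, pulling [url, filename] tasks off the queue.
            while True:
                url, filename = self.q.get()
                try:
                    data = urlopen(url).read()       # fetch the raw XML payload
                    with open(filename, "wb") as f:  # normally written out as an XML file
                        f.write(data)
                    with self._lock:
                        self.firstResults.append(filename)
                except Exception:
                    with self._lock:
                        self.firstResults.append(False)  # mark the failed download
                finally:
                    self.q.task_done()  # lets q.join() unblock once all tasks are done

    tasks = Fetcher()
    for i in range(8):                               # eight workers, as in the removed code
        t = threading.Thread(target=tasks.worker2)
        t.daemon = True                              # dies when the main thread exits
        t.start()

    for url in ["http://localhost/374255", "http://localhost/374278"]:  # test URLs from the diff
        tasks.q.put([url, "/tmp/%s.xml" % url.rsplit("/", 1)[-1]])
    tasks.q.join()                                   # wait until everything is finished
    print(tasks.firstResults)

This also explains the "if filename != False" check in the removed loop: a worker that fails reports False instead of a filename, so dwnldsOK only counts successful downloads.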
...
...
@@ -164,59 +164,59 @@ def testISTEX(request , project_id):
     print(query_string , query , N)
-    urlreqs = []
-    pagesize = 50
-    tasks = MedlineFetcher()
-    chunks = list(tasks.chunks(range(N), pagesize))
-    for k in chunks:
-        if (k[0]+pagesize)>N: pagesize = N-k[0]
-        urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
-    print(urlreqs)
-    urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
-    print(urlreqs)
-    resource_type = ResourceType.objects.get(name="istext" )
-    parent = Node.objects.get(id=project_id)
-    node_type = NodeType.objects.get(name='Corpus')
-    type_id = NodeType.objects.get(name='Document').id
-    user_id = User.objects.get( username=request.user ).id
-    corpus = Node(
-        user=request.user,
-        parent=parent,
-        type=node_type,
-        name=query,
-    )
-    corpus.save()
-    # configuring your queue with the event
-    for i in range(8):
-        t = threading.Thread(target=tasks.worker2) #thing to do
-        t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
-        t.start()
-    for url in urlreqs:
-        filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
-        tasks.q.put( [url , filename]) #put a task in th queue
-    tasks.q.join() # wait until everything is finished
-    for filename in tasks.firstResults:
-        corpus.add_resource( user=request.user, type=resource_type, file=filename )
-    corpus.save()
-    print("DEBUG:",DEBUG)
-    # do the WorkFlow
-    try:
-        if DEBUG is True:
-            corpus.workflow()
-        else:
-            corpus.workflow.apply_async((), countdown=3)
-        return JsonHttpResponse(["workflow","finished"])
-    except Exception as error:
-        print(error)
+    # urlreqs = []
+    # pagesize = 50
+    # tasks = MedlineFetcher()
+    # chunks = list(tasks.chunks(range(N), pagesize))
+    # for k in chunks:
+    #     if (k[0]+pagesize)>N: pagesize = N-k[0]
+    #     urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
+    # print(urlreqs)
+    # urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
+    # print(urlreqs)
+    # resource_type = ResourceType.objects.get(name="istext" )
+    # parent = Node.objects.get(id=project_id)
+    # node_type = NodeType.objects.get(name='Corpus')
+    # type_id = NodeType.objects.get(name='Document').id
+    # user_id = User.objects.get( username=request.user ).id
+    # corpus = Node(
+    #     user=request.user,
+    #     parent=parent,
+    #     type=node_type,
+    #     name=query,
+    # )
+    # corpus.save()
+    # # configuring your queue with the event
+    # for i in range(8):
+    #     t = threading.Thread(target=tasks.worker2) #thing to do
+    #     t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
+    #     t.start()
+    # for url in urlreqs:
+    #     filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
+    #     tasks.q.put( [url , filename]) #put a task in th queue
+    # tasks.q.join() # wait until everything is finished
+    # for filename in tasks.firstResults:
+    #     corpus.add_resource( user=request.user, type=resource_type, file=filename )
+    # corpus.save()
+    # print("DEBUG:",DEBUG)
+    # # do the WorkFlow
+    # try:
+    #     if DEBUG is True:
+    #         corpus.workflow()
+    #     else:
+    #         corpus.workflow.apply_async((), countdown=3)
+    #     return JsonHttpResponse(["workflow","finished"])
+    # except Exception as error:
+    #     print(error)
     data = [query_string , query , N]
     return JsonHttpResponse(data)
...
...
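The testISTEX body removed above pages through an N-result ISTEX query by chunking range(N) into pagesize-sized slices and emitting one from=/size= request URL per slice, trimming the final page to the remainder. A minimal sketch of that arithmetic, assuming chunks() splits a sequence into consecutive runs (the actual MedlineFetcher.chunks helper is not shown in this diff):

    def chunks(seq, size):
        # Hypothetical equivalent of tasks.chunks(): consecutive slices of `size` items.
        for i in range(0, len(seq), size):
            yield seq[i:i + size]

    def istex_urls(query_string, N, pagesize=50):
        # Rebuilds the paged ISTEX request URLs the same way the removed loop did.
        urlreqs = []
        for k in list(chunks(range(N), pagesize)):
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]  # the last page only asks for the remainder
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                           + "&output=*&" + "from=" + str(k[0]) + "&size=" + str(pagesize))
        return urlreqs

    # 120 hits with pagesize 50 -> from=0&size=50, from=50&size=50, from=100&size=20
    print(istex_urls("brain", 120))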