humanities / gargantext · Commits · fcd75235

Commit fcd75235 authored Mar 25, 2015 by PkSM3

[UPDATE] last progress (nothing definitive)

parent 5f4f3e0b

Showing 4 changed files with 146 additions and 82 deletions (+146 / -82)
  init/requirements.txt                      +2    -2
  parsing/corpustools.py                     +32   -5
  scrap_pubmed/MedlineFetcherDavid2015.py    +9    -8
  scrap_pubmed/views.py                      +103  -67
init/requirements.txt

@@ -38,9 +38,9 @@ graphviz==0.4
 ipython==2.2.0
 kombu==3.0.23
 lxml==3.4.1
-matplotlib==1.4.0
+# matplotlib==1.4.0
 networkx==1.9
-nltk==3.0a4
+# nltk==3.0a4
 nose==1.3.4
 numpy==1.8.2
 pandas==0.14.1
parsing/corpustools.py

@@ -206,34 +206,54 @@ def extract_ngrams(corpus, keys):
        .filter(Node.type_id == cache.NodeType['Document'].id)
    )
    # prepare data to be inserted
    print("\n= = = = = =")
    dbg.show('find ngrams')
    print('000001')
    languages_by_id = {language.id: language.iso2 for language in session.query(Language)}
    print('000002')
    ngrams_data = set()
    ngrams_language_data = set()
    ngrams_tag_data = set()
    print('000003')
    node_ngram_list = defaultdict(lambda: defaultdict(int))
    for nodeinfo in metadata_query:
        print('\t000004')
        node_id = nodeinfo[0]
        language_id = nodeinfo[1]
        if language_id is None:
            language_iso2 = default_language_iso2
        else:
            language_iso2 = languages_by_id.get(language_id, None)
            if language_iso2 is None:
                continue
        print('\t000005')
        print('\t', language_iso2)
        ngramsextractor = ngramsextractors[language_iso2]
        print('\t', ngramsextractor)
        print('\t000006')
        for text in nodeinfo[2:]:
            if text is not None and len(text):
                print('\t\t000007')
                ngrams = ngramsextractor.extract_ngrams(text.replace("[", "").replace("]", ""))
                # print(ngrams)
                print('\t\t000008')
                for ngram in ngrams:
                    print('\t\t\t000009')
                    print('\t\t\t', ngram)
                    n = len(ngram)
                    print('\t\t\tn:', n)
                    print('\t\t\t000010')
                    terms = ' '.join([token for token, tag in ngram]).lower()
                    print('\t\t\t000011')
                    import pprint
                    pprint.pprint(cache.Tag)
                    # TODO BUG here
                    if n == 1:
                        tag_id = cache.Tag[ngram[0][1]].id
@@ -243,13 +263,20 @@ def extract_ngrams(corpus, keys):
                        tag_id = cache.Tag['NN'].id
                        #tag_id = 14
                        #print('tag_id_2', tag_id)
                    print('\t\t\t000012')
                    node_ngram_list[node_id][terms] += 1
                    print('\t\t\t000013')
                    ngrams_data.add((n, terms))
                    print('\t\t\t000014')
                    ngrams_language_data.add((terms, language_id))
                    print('\t\t\t000015')
                    ngrams_tag_data.add((terms, tag_id))
                    print('\t\t\t000016')
                print('\t\t000018')
            print('\t\t000019')
        # dbg.show('\t000007')
    print('000020')
    # insert ngrams to temporary table
    dbg.show('find ids for the %d ngrams' % len(ngrams_data))
    db, cursor = get_cursor()
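An editorial note on the "# TODO BUG here" marker above: in the single-token branch, cache.Tag[ngram[0][1]] raises KeyError whenever the extractor emits a POS tag missing from the tag cache, while the surrounding code already falls back to cache.Tag['NN'].id elsewhere. A minimal defensive sketch, assuming cache.Tag is dict-like; the helper name is illustrative and not part of this commit:

    # Hypothetical helper, not in this commit: resolve a POS tag to its
    # cached id, falling back to the generic noun tag 'NN' instead of
    # raising KeyError on an unknown tag.
    def resolve_tag_id(tag_cache, pos_tag, default='NN'):
        tag = tag_cache.get(pos_tag)   # assumes a dict-like .get()
        if tag is None:
            tag = tag_cache[default]   # 'NN' is assumed to exist in the cache
        return tag.id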
@@ -320,10 +347,10 @@ def extract_ngrams(corpus, keys):
    # commit to database
    db.commit()
    print("= = = = = =\n")

# tfidf calculation
def compute_tfidf(corpus):
    dbg = DebugTime('Corpus #%d - tfidf' % corpus.id)
    # compute terms frequency sum
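The numbered markers 000001 through 000020 above are temporary progress traces through extract_ngrams. If the tracing is meant to stay, the standard logging module gives the same trail with levels that can be switched off in production; a minimal sketch, not part of the commit:

    import logging

    # One logger per module; DEBUG level reproduces the print-style trace.
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('parsing.corpustools')

    logger.debug('find ngrams')            # instead of print('000001')
    logger.debug('extractor: %s', 'en')    # instead of print('\t', language_iso2)
    # Raising the level to logging.INFO silences the whole trace at once.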
scrap_pubmed/MedlineFetcherDavid2015.py

@@ -40,7 +40,7 @@ class MedlineFetcher:
         "Get number of results for query 'query' in variable 'count'"
         "Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
-        print(query)
+        # print(query)
         origQuery = query
         query = query.replace(' ', '%20')
@@ -79,7 +79,7 @@ class MedlineFetcher:
         queryNoSpace = query.replace(' ', '')  # No space in directory and file names, avoids stupid errors
-        print("LOG::TIME: ", 'medlineEfetchRAW :Query "', query, '"\t:\t', count, ' results')
+        # print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
         retstart = 0
         eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' % (self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
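The eFetch URL above is assembled with %-formatting after hand-escaping spaces as %20. urllib.parse.urlencode performs that escaping for every parameter at once; a sketch under assumed values (the base URL and parameter values are illustrative stand-ins for the MedlineFetcher instance attributes):

    from urllib.parse import urlencode

    # Illustrative values; normally queryKey and webEnv come back from ESearch.
    pubMedEutilsURL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
    queryKey, webEnv = '1', 'NCID_example'

    params = urlencode({
        'email':     'youremail@example.org',
        'rettype':   'abstract',   # self.reportType in the class
        'retmode':   'xml',
        'retstart':  0,
        'retmax':    100,
        'db':        'Pubmed',     # self.pubMedDB in the class
        'query_key': queryKey,
        'WebEnv':    webEnv,
    })
    eFetch = '%s/efetch.fcgi?%s' % (pubMedEutilsURL, params)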
@@ -94,7 +94,7 @@ class MedlineFetcher:
     def downloadFile(self, item):
         url = item[0]
         filename = item[1]
-        print("\tin test_downloadFile:")
+        # print("\tin test_downloadFile:")
         # print(url,filename)
         data = urlopen(url)
         f = codecs.open(filename, "w", encoding='utf-8')
def
test_downloadFile
(
self
,
item
):
url
=
item
[
0
]
filename
=
item
[
1
]
print
(
"
\t
in downloadFile:"
)
#
print("\tin downloadFile:")
data
=
urlopen
(
url
)
return
data
...
...
@@ -119,7 +119,7 @@ class MedlineFetcher:
         # time.sleep(1) # pretend to do some lengthy work.
         returnvalue = self.medlineEsearch(item)
         with self.lock:
-            print(threading.current_thread().name, item)
+            # print(threading.current_thread().name, item)
         return returnvalue
     # The worker thread pulls an item from the queue and processes it
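The worker referenced above follows the standard queue.Queue consumer pattern that doTheQuery in views.py below also drives: daemon threads pull items until q.join() unblocks. A self-contained sketch of that pattern, not the class's exact code:

    import threading
    import queue

    q = queue.Queue()

    def worker():
        # Pull items until the producer's q.join() releases.
        while True:
            item = q.get()
            try:
                print(threading.current_thread().name, 'processing', item)
            finally:
                q.task_done()              # mark item finished for q.join()

    for _ in range(8):                     # 8 workers, as in doTheQuery below
        t = threading.Thread(target=worker)
        t.daemon = True                    # dies when the main thread exits
        t.start()

    for task in ['2015[dp] query', '2014[dp] query']:
        q.put(task)
    q.join()                               # wait until everything is finished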
@@ -160,13 +160,13 @@ class MedlineFetcher:
         N = 0
-        print("MedlineFetcher::serialFetcher :")
+        # print ("MedlineFetcher::serialFetcher :")
         thequeries = []
         globalresults = []
         for i in range(yearsNumber):
             year = str(2015 - i)
-            print('YEAR ' + year)
-            print('---------\n')
+            # print ('YEAR ' + year)
+            # print ('---------\n')
             pubmedquery = str(year) + '[dp] ' + query
             self.q.put(pubmedquery)  # put task in the queue
@@ -196,5 +196,6 @@ class MedlineFetcher:
             retmax_forthisyear = int(round(globalLimit * proportion))
             query["retmax"] = retmax_forthisyear
+            if query["retmax"] == 0: query["retmax"] += 1
             print(query["string"], "\t[", k, ">", query["retmax"], "]")
         return thequeries
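The added guard keeps low-volume years from being dropped entirely: when a year's share of globalLimit rounds to zero, retmax is bumped to one so at least one record is still fetched for that year. A worked example with illustrative numbers:

    globalLimit = 100
    proportion = 0.003            # a year holding 0.3% of all results

    retmax = int(round(globalLimit * proportion))   # round(0.3) -> 0
    if retmax == 0:
        retmax += 1               # without the guard, this year fetches nothing
    print(retmax)                 # 1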
scrap_pubmed/views.py

from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect
from django.template.loader import get_template
from django.template import Context
from django.contrib.auth.models import User, Group
from scrap_pubmed.MedlineFetcherDavid2015 import MedlineFetcher
from gargantext_web.api import JsonHttpResponse
from urllib.request import urlopen, urlretrieve
import json
from gargantext_web.settings import MEDIA_ROOT
# from datetime import datetime
import time
import datetime
@@ -21,9 +16,23 @@ import threading
from django.core.files import File
from gargantext_web.settings import DEBUG
from node.models import Language, ResourceType, Resource, \
    Node, NodeType, Node_Resource, Project, Corpus, \
    Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect, HttpResponseForbidden
from sqlalchemy import func
from sqlalchemy.orm import aliased
from collections import defaultdict
import threading
from node.admin import CustomForm
from gargantext_web.db import *
from gargantext_web.settings import DEBUG, MEDIA_ROOT
from gargantext_web.api import JsonHttpResponse
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf

def getGlobalStats(request):
@@ -31,7 +40,7 @@ def getGlobalStats(request ):
     alist = ["bar", "foo"]
     if request.method == "POST":
-        N = 100
+        N = 10
         query = request.POST["query"]
         print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
         print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)
@@ -72,17 +81,57 @@ def getGlobalStatsISTEXT(request ):
def doTheQuery(request, project_id):
    alist = ["hola", "mundo"]
    if request.method == "POST":
        # query = request.POST["query"]
        # name = request.POST["string"]

        # SQLAlchemy session
        session = Session()
        # instancia = MedlineFetcher()
        # thequeries = json.loads(query)

        # do we have a valid project id?
        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        # urlreqs = []
        # for yearquery in thequeries:
        #     urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
        # alist = ["tudo fixe" , "tudo bem"]

        # do we have a valid project?
        project = (session
            .query(Node)
            .filter(Node.id == project_id)
            .filter(Node.type_id == cache.NodeType['Project'].id)
        ).first()
        if project is None:
            raise Http404()

        # do we have a valid user?
        user = request.user
        if not user.is_authenticated():
            return redirect('/login/?next=%s' % request.path)
        if project.user_id != user.id:
            return HttpResponseForbidden()

        if request.method == "POST":
            query = request.POST["query"]
            name = request.POST["string"]
            instancia = MedlineFetcher()
            thequeries = json.loads(query)
            urlreqs = []
            for yearquery in thequeries:
                urlreqs.append(instancia.medlineEfetchRAW(yearquery))
            alist = ["tudo fixe", "tudo bem"]

            resourcetype = cache.ResourceType["pubmed"]

            # corpus node instanciation as a Django model
            corpus = Node(
                name=name,
                user_id=request.user.id,
                parent_id=project_id,
                type_id=cache.NodeType['Corpus'].id,
                language_id=None,
            )
            session.add(corpus)
            session.commit()

            # """
            # urlreqs: List of urls to query.
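The validation steps added above (integer id, existing Project node, authenticated owner) could be shared by the other views in this file; a hypothetical helper sketching that extraction, reusing the imports already present here (the name get_owned_project is not part of this commit):

    # Hypothetical helper, not in this commit: centralize the id/404/login/403
    # checks that doTheQuery performs inline. Returns (project, error_response);
    # exactly one of the two is None.
    def get_owned_project(session, request, project_id):
        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()
        project = (session.query(Node)
            .filter(Node.id == project_id)
            .filter(Node.type_id == cache.NodeType['Project'].id)
            .first())
        if project is None:
            raise Http404()
        if not request.user.is_authenticated():
            return None, redirect('/login/?next=%s' % request.path)
        if project.user_id != request.user.id:
            return None, HttpResponseForbidden()
        return project, None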
@@ -91,57 +140,44 @@ def doTheQuery(request , project_id):
            # eFetchResult.read()  # this will output the XML... normally you write this to a XML-file.
            # """
            # thefile = "how we do this here?"
            # resource_type = ResourceType.objects.get(name="pubmed" )
            # parent = Node.objects.get(id=project_id)
            # node_type = NodeType.objects.get(name='Corpus')
            # type_id = NodeType.objects.get(name='Document').id
            # user_id = User.objects.get( username=request.user ).id
            # corpus = Node(
            #     user=request.user,
            #     parent=parent,
            #     type=node_type,
            #     name=name,
            # )
            # corpus.save()
            # tasks = MedlineFetcher()
            # for i in range(8):
            #     t = threading.Thread(target=tasks.worker2) #thing to do
            #     t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
            #     t.start()
            # for url in urlreqs:
            #     filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
            #     tasks.q.put( [url , filename]) #put a task in th queue
            # tasks.q.join() # wait until everything is finished
            # dwnldsOK = 0
            # for filename in tasks.firstResults:
            #     if filename!=False:
            #         corpus.add_resource( user=request.user, type=resource_type, file=filename )
            #         dwnldsOK+=1
            tasks = MedlineFetcher()
            for i in range(8):
                t = threading.Thread(target=tasks.worker2)  # thing to do
                t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
                t.start()
            for url in urlreqs:
                filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
                tasks.q.put([url, filename])  # put a task in th queue
            tasks.q.join()  # wait until everything is finished
            dwnldsOK = 0
            for filename in tasks.firstResults:
                if filename != False:
                    # add the uploaded resource to the corpus
                    add_resource(corpus,
                        user_id=request.user.id,
                        type_id=resourcetype.id,
                        file=filename,
                    )
                    dwnldsOK += 1
            # if dwnldsOK == 0: return JsonHttpResponse(["fail"])
            # # do the WorkFlow
            # try:
            #     if DEBUG is True:
            #         # corpus.workflow() # old times...
            #         corpus.workflow__MOV()
            #         # corpus.write_everything_to_DB()
            #     else:
            #         # corpus.workflow.apply_async((), countdown=3)
            #         corpus.workflow__MOV().apply_async((), countdown=3) # synchronous! because is faaast
            #         # corpus.write_everything_to_DB.apply_async((), countdown=3) # asynchronous
            #     return JsonHttpResponse(["workflow","finished"])
            # except Exception as error:
            #     print(error)
            return JsonHttpResponse(["out of service for the moment"])
            if dwnldsOK == 0:
                return JsonHttpResponse(["fail"])
            try:
                parse_resources(corpus)
            except Exception as error:
                print("!OK parse:", error)
            try:
                extract_ngrams(corpus, ['title'])
            except Exception as error:
                print("!OK ngrams:", error)
            # try: compute_tfidf(corpus)
            # except Exception as error: print("!OK tfidf:",error)
            # # except Exception as error:
            # #     print('WORKFLOW ERROR')
            # #     print(error)
            # # # redirect to the main project page
            return HttpResponseRedirect('/project/' + str(project_id))
    data = alist
    return JsonHttpResponse(data)