Commit fcd75235 authored by PkSM3's avatar PkSM3

[UPDATE] last progress (nothing definitive)

parent 5f4f3e0b
......@@ -38,9 +38,9 @@ graphviz==0.4
ipython==2.2.0
kombu==3.0.23
lxml==3.4.1
matplotlib==1.4.0
#matplotlib==1.4.0
networkx==1.9
nltk==3.0a4
#nltk==3.0a4
nose==1.3.4
numpy==1.8.2
pandas==0.14.1
......
......@@ -206,34 +206,54 @@ def extract_ngrams(corpus, keys):
.filter(Node.type_id == cache.NodeType['Document'].id)
)
# prepare data to be inserted
print("\n= = = = = =")
dbg.show('find ngrams')
print('000001')
languages_by_id = {
language.id: language.iso2
for language in session.query(Language)
}
print('000002')
ngrams_data = set()
ngrams_language_data = set()
ngrams_tag_data = set()
print('000003')
node_ngram_list = defaultdict(lambda: defaultdict(int))
for nodeinfo in metadata_query:
print('\t000004')
node_id = nodeinfo[0]
language_id = nodeinfo[1]
if language_id is None:
language_iso2 = default_language_iso2
else:
language_iso2 = languages_by_id.get(language_id, None)
if language_iso2 is None:
continue
print('\t000005')
print('\t',language_iso2)
ngramsextractor = ngramsextractors[language_iso2]
print('\t',ngramsextractor)
print('\t000006')
for text in nodeinfo[2:]:
if text is not None and len(text):
print('\t\t000007')
ngrams = ngramsextractor.extract_ngrams(text.replace("[","").replace("]",""))
# print(ngrams)
print('\t\t000008')
for ngram in ngrams:
print('\t\t\t000009')
print('\t\t\t',ngram)
n = len(ngram)
print('\t\t\tn:',n)
print('\t\t\t000010')
terms = ' '.join([token for token, tag in ngram]).lower()
print('\t\t\t000011')
import pprint
pprint.pprint(cache.Tag)
# TODO BUG here
if n == 1:
tag_id = cache.Tag[ngram[0][1]].id
......@@ -243,13 +263,20 @@ def extract_ngrams(corpus, keys):
tag_id = cache.Tag['NN'].id
#tag_id = 14
#print('tag_id_2', tag_id)
print('\t\t\t000012')
node_ngram_list[node_id][terms] += 1
print('\t\t\t000013')
ngrams_data.add((n, terms))
print('\t\t\t000014')
ngrams_language_data.add((terms, language_id))
print('\t\t\t000015')
ngrams_tag_data.add((terms, tag_id))
print('\t\t\t000016')
print('\t\t000018')
print('\t\t000019')
# dbg.show('\t000007')
print('000020')
# insert ngrams to temporary table
dbg.show('find ids for the %d ngrams' % len(ngrams_data))
db, cursor = get_cursor()
......@@ -320,10 +347,10 @@ def extract_ngrams(corpus, keys):
# commit to database
db.commit()
print("= = = = = =\n")
# tfidf calculation
def compute_tfidf(corpus):
dbg = DebugTime('Corpus #%d - tfidf' % corpus.id)
# compute terms frequency sum
......
......@@ -40,7 +40,7 @@ class MedlineFetcher:
"Get number of results for query 'query' in variable 'count'"
"Get also 'queryKey' and 'webEnv', which are used by function 'medlineEfetch'"
print(query)
# print(query)
origQuery = query
query = query.replace(' ', '%20')
......@@ -79,7 +79,7 @@ class MedlineFetcher:
queryNoSpace = query.replace(' ', '') # No space in directory and file names, avoids stupid errors
print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
# print ("LOG::TIME: ",'medlineEfetchRAW :Query "' , query , '"\t:\t' , count , ' results')
retstart = 0
eFetch = '%s/efetch.fcgi?email=youremail@example.org&rettype=%s&retmode=xml&retstart=%s&retmax=%s&db=%s&query_key=%s&WebEnv=%s' %(self.pubMedEutilsURL, self.reportType, retstart, retmax, self.pubMedDB, queryKey, webEnv)
......@@ -94,7 +94,7 @@ class MedlineFetcher:
def downloadFile(self, item):
url = item[0]
filename = item[1]
print("\tin test_downloadFile:")
# print("\tin test_downloadFile:")
# print(url,filename)
data = urlopen(url)
f = codecs.open(filename, "w" ,encoding='utf-8')
......@@ -110,7 +110,7 @@ class MedlineFetcher:
def test_downloadFile(self, item):
url = item[0]
filename = item[1]
print("\tin downloadFile:")
# print("\tin downloadFile:")
data = urlopen(url)
return data
......@@ -119,7 +119,7 @@ class MedlineFetcher:
# time.sleep(1) # pretend to do some lengthy work.
returnvalue = self.medlineEsearch(item)
with self.lock:
print(threading.current_thread().name, item)
# print(threading.current_thread().name, item)
return returnvalue
# The worker thread pulls an item from the queue and processes it
......@@ -160,13 +160,13 @@ class MedlineFetcher:
N = 0
print ("MedlineFetcher::serialFetcher :")
# print ("MedlineFetcher::serialFetcher :")
thequeries = []
globalresults = []
for i in range(yearsNumber):
year = str(2015 - i)
print ('YEAR ' + year)
print ('---------\n')
# print ('YEAR ' + year)
# print ('---------\n')
pubmedquery = str(year) + '[dp] '+query
self.q.put( pubmedquery ) #put task in the queue
......@@ -196,5 +196,6 @@ class MedlineFetcher:
retmax_forthisyear = int(round(globalLimit*proportion))
query["retmax"] = retmax_forthisyear
if query["retmax"]==0: query["retmax"]+=1
print(query["string"],"\t[",k,">",query["retmax"],"]")
return thequeries
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect
from django.template.loader import get_template
from django.template import Context
from django.contrib.auth.models import User, Group
from scrap_pubmed.MedlineFetcherDavid2015 import MedlineFetcher
from gargantext_web.api import JsonHttpResponse
from urllib.request import urlopen, urlretrieve
import json
from gargantext_web.settings import MEDIA_ROOT
# from datetime import datetime
import time
import datetime
......@@ -21,9 +16,23 @@ import threading
from django.core.files import File
from gargantext_web.settings import DEBUG
from node.models import Language, ResourceType, Resource, \
Node, NodeType, Node_Resource, Project, Corpus, \
Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram
from django.shortcuts import redirect
from django.shortcuts import render
from django.http import Http404, HttpResponse, HttpResponseRedirect, HttpResponseForbidden
from sqlalchemy import func
from sqlalchemy.orm import aliased
from collections import defaultdict
import threading
from node.admin import CustomForm
from gargantext_web.db import *
from gargantext_web.settings import DEBUG, MEDIA_ROOT
from gargantext_web.api import JsonHttpResponse
from parsing.corpustools import add_resource, parse_resources, extract_ngrams, compute_tfidf
def getGlobalStats(request ):
......@@ -31,7 +40,7 @@ def getGlobalStats(request ):
alist = ["bar","foo"]
if request.method == "POST":
N = 100
N = 10
query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
......@@ -72,17 +81,57 @@ def getGlobalStatsISTEXT(request ):
def doTheQuery(request , project_id):
alist = ["hola","mundo"]
if request.method == "POST":
# query = request.POST["query"]
# name = request.POST["string"]
# SQLAlchemy session
session = Session()
# instancia = MedlineFetcher()
# thequeries = json.loads(query)
# do we have a valid project id?
try:
project_id = int(project_id)
except ValueError:
raise Http404()
# urlreqs = []
# for yearquery in thequeries:
# urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
# alist = ["tudo fixe" , "tudo bem"]
# do we have a valid project?
project = (session
.query(Node)
.filter(Node.id == project_id)
.filter(Node.type_id == cache.NodeType['Project'].id)
).first()
if project is None:
raise Http404()
# do we have a valid user?
user = request.user
if not user.is_authenticated():
return redirect('/login/?next=%s' % request.path)
if project.user_id != user.id:
return HttpResponseForbidden()
if request.method == "POST":
query = request.POST["query"]
name = request.POST["string"]
instancia = MedlineFetcher()
thequeries = json.loads(query)
urlreqs = []
for yearquery in thequeries:
urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
alist = ["tudo fixe" , "tudo bem"]
resourcetype = cache.ResourceType["pubmed"]
# corpus node instanciation as a Django model
corpus = Node(
name = name,
user_id = request.user.id,
parent_id = project_id,
type_id = cache.NodeType['Corpus'].id,
language_id = None,
)
session.add(corpus)
session.commit()
# """
# urlreqs: List of urls to query.
......@@ -91,57 +140,44 @@ def doTheQuery(request , project_id):
# eFetchResult.read() # this will output the XML... normally you write this to a XML-file.
# """
# thefile = "how we do this here?"
# resource_type = ResourceType.objects.get(name="pubmed" )
# parent = Node.objects.get(id=project_id)
# node_type = NodeType.objects.get(name='Corpus')
# type_id = NodeType.objects.get(name='Document').id
# user_id = User.objects.get( username=request.user ).id
# corpus = Node(
# user=request.user,
# parent=parent,
# type=node_type,
# name=name,
# )
# corpus.save()
# tasks = MedlineFetcher()
# for i in range(8):
# t = threading.Thread(target=tasks.worker2) #thing to do
# t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
# t.start()
# for url in urlreqs:
# filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
# tasks.q.put( [url , filename]) #put a task in th queue
# tasks.q.join() # wait until everything is finished
# dwnldsOK = 0
# for filename in tasks.firstResults:
# if filename!=False:
# corpus.add_resource( user=request.user, type=resource_type, file=filename )
# dwnldsOK+=1
tasks = MedlineFetcher()
for i in range(8):
t = threading.Thread(target=tasks.worker2) #thing to do
t.daemon = True # thread dies when main thread (only non-daemon thread) exits.
t.start()
for url in urlreqs:
filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
tasks.q.put( [url , filename]) #put a task in th queue
tasks.q.join() # wait until everything is finished
dwnldsOK = 0
for filename in tasks.firstResults:
if filename!=False:
# add the uploaded resource to the corpus
add_resource(corpus,
user_id = request.user.id,
type_id = resourcetype.id,
file = filename,
)
dwnldsOK+=1
# if dwnldsOK == 0: return JsonHttpResponse(["fail"])
# # do the WorkFlow
# try:
# if DEBUG is True:
# # corpus.workflow() # old times...
# corpus.workflow__MOV()
# # corpus.write_everything_to_DB()
# else:
# # corpus.workflow.apply_async((), countdown=3)
# corpus.workflow__MOV().apply_async((), countdown=3) # synchronous! because is faaast
# # corpus.write_everything_to_DB.apply_async((), countdown=3) # asynchronous
# return JsonHttpResponse(["workflow","finished"])
# except Exception as error:
# print(error)
return JsonHttpResponse(["out of service for the moment"])
if dwnldsOK == 0: return JsonHttpResponse(["fail"])
try: parse_resources(corpus)
except Exception as error: print("!OK parse:",error)
try: extract_ngrams(corpus, ['title'])
except Exception as error: print("!OK ngrams:",error)
# try: compute_tfidf(corpus)
# except Exception as error: print("!OK tfidf:",error)
# # except Exception as error:
# # print('WORKFLOW ERROR')
# # print(error)
# # # redirect to the main project page
return HttpResponseRedirect('/project/' + str(project_id))
data = alist
return JsonHttpResponse(data)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment