Commit 2b36c5d0 authored by PkSM3

[UPDATE] pubmed|istex removal (for the moment)

parent fc089d47
@@ -287,346 +287,7 @@ class Node(CTENode):
print("In workflow() END") print("In workflow() END")
self.metadata['Processing'] = 0 self.metadata['Processing'] = 0
self.save() self.save()
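    # Helper: runs each of the given callables in its own Process and blocks until all of them finish.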
    def runInParallel(self, *fns):
        proc = []
        for fn in fns:
            p = Process(target=fn)
            p.start()
            proc.append(p)
        for p in proc:
            p.join()
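    # Parses every not-yet-parsed resource attached to this node with the parser matching its
    # resource type (pubmed, isi, ris, europress, ...) and returns the combined metadata list.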
    def parse_resources__MOV(self, verbose=False):
        # parse all resources into a list of metadata
        metadata_list = []
        print("not parsed resources:")
        print(self.node_resource.filter(parsed=False))
        print("= = = = = = = = = = =\n")
        for node_resource in self.node_resource.filter(parsed=False):
            resource = node_resource.resource
            parser = defaultdict(lambda: FileParser.FileParser, {
                'istext'            : ISText,
                'pubmed'            : PubmedFileParser,
                'isi'               : IsiFileParser,
                'ris'               : RisFileParser,
                'europress'         : EuropressFileParser,
                'europress_french'  : EuropressFileParser,
                'europress_english' : EuropressFileParser,
            })[resource.type.name]()
            metadata_list += parser.parse(str(resource.file))
        self.node_resource.update(parsed=True)  # writing to DB
        return metadata_list
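    # Creates one Document child node per metadata entry (title, language, raw metadata),
    # then makes the children's metadata filterable and marks the resources as parsed.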
    def writeMetadata__MOV(self, metadata_list=None, verbose=False):
        type_id = NodeType.objects.get(name='Document').id
        user_id = self.user.id
        langages_cache = LanguagesCache()
        # # insert the new resources in the database!
        for i, metadata_values in enumerate(metadata_list):
            name = metadata_values.get('title', '')[:200]
            language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
            if isinstance(language, tuple):
                language = language[0]
            try:
                node = Node(
                    user_id = user_id,
                    type_id = type_id,
                    name = name,
                    parent = self,
                    language_id = language.id if language else None,
                    metadata = metadata_values
                )
                node.save()
            except Exception as e:
                print("ERR::node.models.writeMetadata__MOV ")
            metadata_values["id"] = node.id
        # # make metadata filterable
        self.children.all().make_metadata_filterable()
        # # mark the resources as parsed for this node
        self.node_resource.update(parsed=True)
        return metadata_list
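    # For each document's metadata, extracts n-grams from the selected fields with the
    # language-specific extractor and returns [document_id, {ngram: weight}] pairs.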
    def extract_ngrams__MOV(self, array, keys, ngramsextractorscache=None, ngramscaches=None):
        if ngramsextractorscache is None:
            ngramsextractorscache = NgramsExtractorsCache()
        langages_cache = LanguagesCache()
        if ngramscaches is None:
            ngramscaches = NgramsCaches()
        results = []
        i = 0
        for metadata in array:
            associations = defaultdict(float)  # float or int?
            language = langages_cache[metadata['language_iso2']] if 'language_iso2' in metadata else None,
            if isinstance(language, tuple):
                language = language[0]
            extractor = ngramsextractorscache[language]
            ngrams = ngramscaches[language]
            # theText = []
            if isinstance(keys, dict):
                for key, weight in keys.items():
                    if key in metadata:
                        text2process = str(metadata[key]).replace('[','').replace(']','')
                        # theText.append(text2process)
                        for ngram in extractor.extract_ngrams(text2process):
                            terms = ' '.join([token for token, tag in ngram]).strip().lower()
                            associations[ngram] += weight
            else:
                for key in keys:
                    if key in metadata:
                        text2process = str(metadata[key]).replace('[','').replace(']','')
                        # theText.append(text2process)
                        for ngram in extractor.extract_ngrams(text2process):
                            terms = ' '.join([token for token, tag in ngram]).strip().lower()
                            associations[terms] += 1
            if len(associations) > 0:
                results.append([metadata["id"], associations])
            i += 1
        return results
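    # Keeps the n most frequent {2,3}-grams over the corpus and scores them per document with
    # tf-idf: tf = occurrences_of_ngram / ngrams_by_document, idf = ln(N / document_count_of_ngram).
    # E.g. a term seen twice among a document's 10 kept n-grams and present in 5 of 100 documents
    # scores (2/10) * ln(100/5) ≈ 0.2 * 3.0 ≈ 0.6. Also adds, per document, a complete subgraph
    # of its co-occurring terms via analysis.InterUnion.Utils.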
    def do_tfidf__MOV(self, FreqList, Metadata, n=150):
        from analysis.InterUnion import Utils
        calc = Utils()
        # *01* [ calculating global count of each ngram ]
        GlobalCount = {}
        for i in FreqList:
            docID = i[0]
            associations = i[1]
            for ngram_text, weight in associations.items():
                if len(ngram_text.split()) > 1 and len(ngram_text.split()) < 4:  # considering just {2,3}-grams
                    if ngram_text in GlobalCount:
                        if "C" in GlobalCount[ngram_text]:
                            GlobalCount[ngram_text]["C"] += 1
                        else:
                            GlobalCount[ngram_text] = {}
                            GlobalCount[ngram_text]["C"] = 1
                    else:
                        GlobalCount[ngram_text] = {}
                        GlobalCount[ngram_text]["C"] = 1
        # *01* [ / calculating global count of each ngram ]
        # *02* [ Considering the first <150 ngrams by DESC occurrences ]
        FirstNgrams = {}
        sortedList = sorted(GlobalCount, key=lambda x: GlobalCount[x]['C'], reverse=True)
        for i in range(len(sortedList)):
            term = sortedList[i]
            FirstNgrams[term] = {}
            FirstNgrams[term]["C"] = GlobalCount[term]["C"]
            if i == (n-1): break
        # *02* [ / Considering the first <150 ngrams by DESC occurrences ]
        N = float(len(FreqList))  # nro docs really processed
        # *03* [ making dictionaries for NGram_Text <=> NGram_ID ]
        NGram2ID = {}
        ID2NGram = {}
        ngramid = 0
        for i in FirstNgrams:
            NGram2ID[i] = ngramid
            ID2NGram[ngramid] = i
            ngramid += 1
        # *03* [ / making dictionaries for NGram_Text <=> NGram_ID ]
        docs_X_terms = {}
        for i in FreqList:  # foreach ID in Doc:
            docID = i[0]
            associations = i[1]
            # [ considering just {2,3}-grams ]
            termsCount = 0
            for ngram_text, weight in associations.items():
                if ngram_text in NGram2ID:  # considering just {2,3}-grams
                    termsCount += 1
            # [ / considering just {2,3}-grams ]
            ngrams_by_document = termsCount  # i re-calculed this because of *02*
            terms = []
            terms_occ = []
            if ngrams_by_document > 0:
                for ngram_text, weight in associations.items():
                    if ngram_text in NGram2ID:
                        terms.append(NGram2ID[ngram_text])
                        # [ calculating TF-IDF ]
                        occurrences_of_ngram = weight
                        term_frequency = occurrences_of_ngram / ngrams_by_document
                        xx = N
                        yy = FirstNgrams[ngram_text]["C"]
                        inverse_document_frequency = log(xx/yy)  # log base e
                        tfidfScore = term_frequency * inverse_document_frequency
                        terms_occ.append([NGram2ID[ngram_text], round(tfidfScore, 3)])
                        # [ / calculating TF-IDF ]
                        if "T" in FirstNgrams[ngram_text]:
                            FirstNgrams[ngram_text]["T"].append(tfidfScore)
                        else:
                            FirstNgrams[ngram_text]["T"] = [tfidfScore]
            if len(terms) > 1:
                docs_X_terms[docID] = terms_occ
                # print("docid:",docID)
                # for i in terms:
                #     print("\t",ID2NGram[i])
                calc.addCompleteSubGraph(terms)
        return {"G": calc.G, "TERMS": ID2NGram, "ii": docs_X_terms, "metrics": FirstNgrams}
    def do_coocmatrix__MOV(self, TERMS, G, n=150, type='node_link'):
        import pandas as pd
        from copy import copy
        import numpy as np
        import networkx as nx
        from networkx.readwrite import json_graph
        from gargantext_web.api import JsonHttpResponse
        from analysis.louvain import best_partition
        matrix = defaultdict(lambda: defaultdict(float))
        ids = dict()
        labels = dict()
        weight = dict()
        print("PRINTING NUMBER OF NODES 01:", len(G))
        for e in G.edges_iter():
            n1 = e[0]
            n2 = e[1]
            w = G[n1][n2]['weight']
            # print(n1," <=> ",n2, " : ", G[n1][n2]['weight'],"\t",TERMS[n1]," <=> ",TERMS[n2], "\t", G[n1][n2]['weight'])
            ids[TERMS[n1]] = n1
            ids[TERMS[n2]] = n2
            labels[n1] = TERMS[n1]
            labels[n2] = TERMS[n2]
            matrix[n1][n2] = w
            matrix[n2][n1] = w
            weight[n2] = weight.get(n2, 0) + w
            weight[n1] = weight.get(n1, 0) + w
        df = pd.DataFrame(matrix).fillna(0)
        x = copy(df.values)
        x = x / x.sum(axis=1)
        # Removing unconnected nodes
        threshold = min(x.max(axis=1))
        matrix_filtered = np.where(x >= threshold, 1, 0)
        # matrix_filtered = np.where(x > threshold, x, 0)
        # matrix_filtered = matrix_filtered.resize((90,90))
        G = nx.from_numpy_matrix(matrix_filtered)
        # G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
        partition = best_partition(G)
        data = []
        if type == "node_link":
            for community in set(partition.values()):
                G.add_node("cluster " + str(community), hidden=1)
            for node in G.nodes():
                try:
                    G.node[node]['label'] = TERMS[node]
                    G.node[node]['pk'] = node
                    G.node[node]['size'] = weight[node]
                    G.node[node]['group'] = partition[node]
                    G.add_edge(node, "cluster " + str(partition[node]), weight=3)
                except Exception as error:
                    print("ERROR:", error)
            print("PRINTING NUMBER OF NODES 02:", len(G))
            data = json_graph.node_link_data(G)
        elif type == "adjacency":
            for node in G.nodes():
                try:
                    # node,type(labels[node])
                    # G.node[node]['label'] = node
                    G.node[node]['name'] = node
                    # G.node[node]['size'] = weight[node]
                    G.node[node]['group'] = partition[node]
                    # G.add_edge(node, partition[node], weight=3)
                except Exception as error:
                    print(error)
            data = json_graph.node_link_data(G)
        return data
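    # Experimental "__MOV" pipeline: parse resources, write Document nodes, extract n-grams,
    # compute tf-idf and the co-occurrence graph, then dump the resulting JSON graph under
    # MEDIA_ROOT/corpora/<user>/<project_id>_<corpus_id>.json, logging the time of each step.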
    def workflow__MOV(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
        import time
        total = 0
        self.metadata['Processing'] = 1
        self.save()
        # # pwd = subprocess.Popen("cd /srv/gargantext/parsing/Taggers/nlpserver && pwd", stdout=subprocess.PIPE).stdout.read()
        # # print(subprocess.Popen(['ls', '-lah'], stdout=subprocess.PIPE).communicate()[0].decode('utf-8'))
        # print("activating nlpserver:")
        # command = 'cd parsing/Taggers/nlpserver; python3 server.py'
        # process = subprocess.Popen(command,stdout=subprocess.PIPE , stderr=subprocess.DEVNULL , shell=True)
        print("LOG::TIME: In workflow() parse_resources__MOV()")
        start = time.time()
        theMetadata = self.parse_resources__MOV()
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources()__MOV [s]", (end - start))
        print("LOG::TIME: In workflow() writeMetadata__MOV()")
        start = time.time()
        theMetadata = self.writeMetadata__MOV(metadata_list=theMetadata)
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]", (end - start))
        print("LOG::TIME: In workflow() extract_ngrams__MOV()")
        start = time.time()
        FreqList = self.extract_ngrams__MOV(theMetadata, keys=['title'])
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" extract_ngrams__MOV() [s]", (end - start))
        # process.kill()
        # print("ok, process killed")
        start = time.time()
        print("LOG::TIME: In workflow() do_tfidf()")
        resultDict = self.do_tfidf__MOV(FreqList, theMetadata)
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]", (end - start))
        # # print("LOG::TIME: In workflow() / do_tfidf()")
        start = time.time()
        print("LOG::TIME: In workflow() do_coocmatrix()")
        jsongraph = self.do_coocmatrix__MOV(resultDict["TERMS"], resultDict["G"], n=150)
        jsongraph["stats"] = resultDict["ii"]
        end = time.time()
        total += (end - start)
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_coocmatrix() [s]", (end - start))
        print("the user:", self.user)
        print("the project id:", self.parent.id)
        print("the corpus id:", self.id)
        # timestamp = str(datetime.datetime.now().isoformat())
        # # filename = MEDIA_ROOT + '/corpora/%s/%s_%s__%s.json' % (self.user , self.parent.id, self.id , timestamp)
        filename = MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (self.user, self.parent.id, self.id)
        import json
        f = open(filename, "w")
        f.write(json.dumps(jsongraph))
        f.close()
        # # # this is not working
        # # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" In workflow() END")
        self.metadata['Processing'] = 0
        self.save()
class Node_Metadata(models.Model):
    node = models.ForeignKey(Node, on_delete=models.CASCADE)
...
@@ -73,75 +73,75 @@ def doTheQuery(request , project_id):
alist = ["hola","mundo"] alist = ["hola","mundo"]
if request.method == "POST": if request.method == "POST":
        # query = request.POST["query"]
        # name = request.POST["string"]
        # instancia = MedlineFetcher()
        # thequeries = json.loads(query)
        # urlreqs = []
        # for yearquery in thequeries:
        #     urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
        # alist = ["tudo fixe" , "tudo bem"]
        # """
        # urlreqs: List of urls to query.
        # - Then, to each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()  # this will output the XML... normally you write this to a XML-file.
        # """
        # thefile = "how we do this here?"
        # resource_type = ResourceType.objects.get(name="pubmed" )
        # parent = Node.objects.get(id=project_id)
        # node_type = NodeType.objects.get(name='Corpus')
        # type_id = NodeType.objects.get(name='Document').id
        # user_id = User.objects.get( username=request.user ).id
        # corpus = Node(
        #     user=request.user,
        #     parent=parent,
        #     type=node_type,
        #     name=name,
        # )
        # corpus.save()
        # tasks = MedlineFetcher()
        # for i in range(8):
        #     t = threading.Thread(target=tasks.worker2)  # thing to do
        #     t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
        #     t.start()
        # for url in urlreqs:
        #     filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.datetime.now().isoformat()))
        #     tasks.q.put( [url , filename] )  # put a task in the queue
        # tasks.q.join()  # wait until everything is finished
        # dwnldsOK = 0
        # for filename in tasks.firstResults:
        #     if filename != False:
        #         corpus.add_resource( user=request.user, type=resource_type, file=filename )
        #         dwnldsOK += 1
        # if dwnldsOK == 0: return JsonHttpResponse(["fail"])
        # # do the WorkFlow
        # try:
        #     if DEBUG is True:
        #         # corpus.workflow()  # old times...
        #         corpus.workflow__MOV()
        #         # corpus.write_everything_to_DB()
        #     else:
        #         # corpus.workflow.apply_async((), countdown=3)
        #         corpus.workflow__MOV().apply_async((), countdown=3)  # synchronous! because is faaast
        #         # corpus.write_everything_to_DB.apply_async((), countdown=3)  # asynchronous
        #     return JsonHttpResponse(["workflow","finished"])
        # except Exception as error:
        #     print(error)
        return JsonHttpResponse(["out of service for the moment"])  # was: return JsonHttpResponse(["workflow","finished","outside the try-except"])
    data = alist
    return JsonHttpResponse(data)
@@ -164,59 +164,59 @@ def testISTEX(request , project_id):
    print(query_string , query , N)
    # urlreqs = []
    # pagesize = 50
    # tasks = MedlineFetcher()
    # chunks = list(tasks.chunks(range(N), pagesize))
    # for k in chunks:
    #     if (k[0]+pagesize) > N: pagesize = N - k[0]
    #     urlreqs.append("http://api.istex.fr/document/?q="+query_string+"&output=*&"+"from="+str(k[0])+"&size="+str(pagesize))
    # print(urlreqs)
    # urlreqs = ["http://localhost/374255" , "http://localhost/374278" ]
    # print(urlreqs)
    # resource_type = ResourceType.objects.get(name="istext" )
    # parent = Node.objects.get(id=project_id)
    # node_type = NodeType.objects.get(name='Corpus')
    # type_id = NodeType.objects.get(name='Document').id
    # user_id = User.objects.get( username=request.user ).id
    # corpus = Node(
    #     user=request.user,
    #     parent=parent,
    #     type=node_type,
    #     name=query,
    # )
    # corpus.save()
    # # configuring your queue with the event
    # for i in range(8):
    #     t = threading.Thread(target=tasks.worker2)  # thing to do
    #     t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
    #     t.start()
    # for url in urlreqs:
    #     filename = MEDIA_ROOT + '/corpora/%s/%s' % (request.user, str(datetime.now().microsecond))
    #     tasks.q.put( [url , filename] )  # put a task in the queue
    # tasks.q.join()  # wait until everything is finished
    # for filename in tasks.firstResults:
    #     corpus.add_resource( user=request.user, type=resource_type, file=filename )
    # corpus.save()
    # print("DEBUG:",DEBUG)
    # # do the WorkFlow
    # try:
    #     if DEBUG is True:
    #         corpus.workflow()
    #     else:
    #         corpus.workflow.apply_async((), countdown=3)
    #     return JsonHttpResponse(["workflow","finished"])
    # except Exception as error:
    #     print(error)
    data = [query_string,query,N]
    return JsonHttpResponse(data)
...