Commit fc33b3c8 authored by sim

Trailing spaces

parent d75224b7
...@@ -58,9 +58,9 @@ def scan_gargantext(corpus_id, lang, request): ...@@ -58,9 +58,9 @@ def scan_gargantext(corpus_id, lang, request):
connection = get_engine().connect() connection = get_engine().connect()
# TODO add some sugar the request (ideally request should be the same for hal and garg) # TODO add some sugar the request (ideally request should be the same for hal and garg)
query = """select count(n.id) from nodes n query = """select count(n.id) from nodes n
where to_tsvector('%s', hyperdata ->> 'abstract' || 'title') where to_tsvector('%s', hyperdata ->> 'abstract' || 'title')
@@ to_tsquery('%s') @@ to_tsquery('%s')
AND n.parent_id = %s;""" % (lang, request, corpus_id) AND n.parent_id = %s;""" % (lang, request, corpus_id)
return [i for i in connection.execute(query)][0][0] return [i for i in connection.execute(query)][0][0]
connection.close() connection.close()
...@@ -76,28 +76,28 @@ def myProject_fromUrl(url): ...@@ -76,28 +76,28 @@ def myProject_fromUrl(url):
def newCorpus(project, resourceName=11, name="Machine learning", query="LSTM"):
    """Create a corpus under *project* and fill it with HAL search results.

    A ``CORPUS`` child is added to ``project`` and committed. The results of
    ``query`` are then fetched from ``HalCrawler`` in pages of 100, parsed
    with ``HalParser`` and stored as ``DOCUMENT`` children of the corpus.
    Finally ``parse_extract_indexhyperdata`` is called on the corpus.

    Args:
        project: parent node; must expose ``add_child``, ``name`` and ``id``.
        resourceName: value stored as the ``type`` of the corpus resource.
            Defaults to 11, the value that used to be hard-coded.
        name: name given to the corpus node and shown in the log message.
        query: search string handed to the HAL crawler.

    Returns:
        The newly created corpus node.
    """
    # Function-scope import kept from the original, but hoisted out of the
    # paging loop so it is executed once instead of once per page.
    from gargantext.util.parsers.HAL import HalParser

    # Honour the caller-supplied name and resource type instead of the
    # placeholders "Corpus name" / 11 that ignored the parameters.
    corpus = project.add_child(name=name, typename='CORPUS')
    corpus.hyperdata["resources"] = [{"extracted" : "true", "type" : resourceName}]
    corpus.hyperdata["statuses"] = [{"action" : "notebook", "complete" : "true"}]
    # [TODO] Add informations needed to get buttons on the Project view.
    session.add(corpus)
    session.commit()
    # Announce the creation only once the corpus has actually been committed.
    print("Corpus \"%s\" in project \"%s\" created" % (name, project.name))

    hal = HalCrawler()
    max_result = hal.scan_results(query)
    paging = 100
    for offset in range(0, max_result, paging):
        docs = (hal._get(query, fromPage=offset, count=paging)
                   .get("response", {})
                   .get("docs", [])
               )
        # `offset` already counts documents (it advances by `paging`), so the
        # running total is offset + paging, capped at the number of results.
        print("%s documents downloaded / %s." % (min(offset + paging, max_result), max_result))

        # [TODO] fix boilerplate for docs here
        new_docs = HalParser(docs)._parse(docs)
        for doc in new_docs:
            # Node names are truncated to 255 characters.
            new_doc = corpus.add_child(name=doc["title"][:255], typename='DOCUMENT')
            new_doc["hyperdata"] = doc
            session.add(new_doc)
            session.commit()

    print("Extracting the ngrams")
    parse_extract_indexhyperdata(corpus)
    print("Corpus is ready to explore:")
    print("http://imt.gargantext.org/projects/%s/corpora/%s/" % (project.id, corpus.id))
    return corpus
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment