Commit bd7294b2 authored by delanoe

Merge remote-tracking branch 'origin/romain-testing-taggers-parsers' into testing

parents b98b816d 08fed238
...@@ -7,21 +7,36 @@ class RISParser(Parser): ...@@ -7,21 +7,36 @@ class RISParser(Parser):
_begin = 6 _begin = 6
_parameters = { _parameters = {
"ER": {"type": "delimiter"}, # the record delimiter "ER": {"type": "delimiter"}, # the record delimiter
"TI": {"type": "hyperdata", "key": "title", "separator": " "}, "TI": {"type": "hyperdata", "key": "title", "separator": " "},
"T1": {"type": "hyperdata", "key": "title", "separator": " "},
# "T1": variant of TI (if together only last will be kept)
"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "}, "ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
"AU": {"type": "hyperdata", "key": "authors", "separator": "\n"}, "AU": {"type": "hyperdata", "key": "authors", "separator": "\n"},
"JO": {"type": "hyperdata", "key": "journal"},
"T2": {"type": "hyperdata", "key": "journal"}, "T2": {"type": "hyperdata", "key": "journal"},
# "T2": variant of JO (if together only last will be kept)
"UR": {"type": "hyperdata", "key": "doi"}, "UR": {"type": "hyperdata", "key": "doi"},
# RIS format specifications: PY is not only year but YYYY/MM/DD with MM and DD optional # RIS format specifications: PY is not only year but YYYY/MM/DD with MM and DD optional
# cf. https://en.wikipedia.org/wiki/RIS_(file_format) # cf. https://en.wikipedia.org/wiki/RIS_(file_format)
"PY": {"type": "hyperdata", "key": "publication_year"}, "PY": {"type": "hyperdata", "key": "publication_year"},
"PD": {"type": "hyperdata", "key": "publication_month"},
"N1": {"type": "hyperdata", "key": "references", "separator": ", "}, "N1": {"type": "hyperdata", "key": "references", "separator": ", "}, # more like notes in reality
"LA": {"type": "hyperdata", "key": "language_iso2"}, "LA": {"type": "hyperdata", "key": "language_iso2"},
"AB": {"type": "hyperdata", "key": "abstract", "separator": " "}, "AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
"WC": {"type": "hyperdata", "key": "fields"},
# TODO other interesting fields
# "KW" (keywords)
# "A1", "A2"... (variants of AU)
# "N2" (variant of AB)
# previously mentioned here but in fact not in RIS specifications
# "PD": {"type": "hyperdata", "key": "publication_month"},
# "WC": {"type": "hyperdata", "key": "fields"},
} }
def parse(self, file): def parse(self, file):
...@@ -33,49 +48,66 @@ class RISParser(Parser): ...@@ -33,49 +48,66 @@ class RISParser(Parser):
for line in file: for line in file:
# bytes ~~> str # bytes ~~> str
line = line.decode("UTF-8").rstrip('\r\n') line = line.decode("UTF-8").rstrip('\r\n')
# print("RIS line:", line)
if len(line) >= 2 : if len(line) >= 2 :
# extract the parameter key... # print("(nonemptyline)")
parameter_key = line[:2]
# test if key line (otherwise: continuation line)
# ...and keep the rest for when we know what to do with it if match(r'[A-Z][A-Z0-9]\s', line):
current_value = line[self._begin:] parameter_key = line[:2]
# print("(matchparamline:"+parameter_key+")")
# it's a new key => therefore the previous key is finished
if parameter_key != last_key: # we can now be sure that the value is rest of the line
# (keep it for when we know what to do with it)
if last_key in self._parameters: current_value = line[self._begin:]
# translate key
parameter = self._parameters[last_key] # it's a new key => therefore the previous key is finished
# 1 - we record the previous value array... if parameter_key != last_key:
if parameter["type"] == "hyperdata":
separator = parameter["separator"] if "separator" in parameter else "" if last_key in self._parameters:
final_value = separator.join(last_values) # translate key
if last_key != 'PY': parameter = self._parameters[last_key]
hyperdata[parameter["key"]] = final_value # 1 - we record the previous value array...
else: if parameter["type"] == "hyperdata":
hyperdata = PY_values_decompose_and_save(final_value, hyperdata) separator = parameter["separator"] if "separator" in parameter else ""
final_value = separator.join(last_values)
#... or even finish the record (rare here, most often after empty line) if last_key != 'PY':
elif parameter["type"] == "delimiter": hyperdata[parameter["key"]] = final_value
if 'language_fullname' not in hyperdata.keys(): else:
if 'language_iso3' not in hyperdata.keys(): hyperdata = PY_values_decompose_and_save(final_value, hyperdata)
if 'language_iso2' not in hyperdata.keys(): # print("{saved previous"+last_key+"}")
hyperdata['language_iso2'] = 'en'
yield hyperdata #... or even finish the record (rare here, most often after empty line)
last_key = None elif parameter["type"] == "delimiter":
hyperdata = {} if 'language_fullname' not in hyperdata.keys():
if 'language_iso3' not in hyperdata.keys():
# 2 - new key: also we start a new value array and move on to the next key if 'language_iso2' not in hyperdata.keys():
last_values = [] hyperdata['language_iso2'] = 'en'
last_key = parameter_key yield hyperdata
# print("{saved previous record}")
# 3 - new key or old: in any case we pass contents to last_key = None
hyperdata = {}
# 2 - new key: also we start a new value array and move on to the next key
last_values = []
last_key = parameter_key
# continuation line: values start from position 0
else:
current_value = line
# print("(continuationline)")
# 3 - new key or old or no key: in any case we pass contents to
# the value array buffer (=> for the next loop only) # the value array buffer (=> for the next loop only)
last_values.append(current_value) last_values.append(current_value)
current_value = None current_value = None
# empty line => we need to check if PREVIOUS LINE was record delimiter # empty line => we need to check if PREVIOUS LINE was record delimiter
else: else:
# print("(emptyline)")
if last_key in self._parameters: if last_key in self._parameters:
if parameter["type"] == "delimiter": if parameter["type"] == "delimiter":
if 'language_fullname' not in hyperdata.keys(): if 'language_fullname' not in hyperdata.keys():
...@@ -83,6 +115,7 @@ class RISParser(Parser): ...@@ -83,6 +115,7 @@ class RISParser(Parser):
if 'language_iso2' not in hyperdata.keys(): if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en' hyperdata['language_iso2'] = 'en'
yield hyperdata yield hyperdata
# print("{saved previous record}")
last_key = None last_key = None
hyperdata = {} hyperdata = {}
# [end of loop per lines] # [end of loop per lines]
...@@ -97,6 +130,7 @@ class RISParser(Parser): ...@@ -97,6 +130,7 @@ class RISParser(Parser):
hyperdata[parameter["key"]] = final_value hyperdata[parameter["key"]] = final_value
else: else:
hyperdata = PY_values_decompose_and_save(final_value, hyperdata) hyperdata = PY_values_decompose_and_save(final_value, hyperdata)
# print("{saved previous"+last_key+"}")
# if a hyperdata object is left in memory, yield it as well # if a hyperdata object is left in memory, yield it as well
if hyperdata: if hyperdata:
...@@ -105,8 +139,7 @@ class RISParser(Parser): ...@@ -105,8 +139,7 @@ class RISParser(Parser):
if 'language_iso2' not in hyperdata.keys(): if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en' hyperdata['language_iso2'] = 'en'
yield hyperdata yield hyperdata
# print("{saved previous record}")
......
...@@ -14,28 +14,9 @@ from gargantext.constants import NODETYPES ...@@ -14,28 +14,9 @@ from gargantext.constants import NODETYPES
from gargantext.util.db import session from gargantext.util.db import session
class RoutesChecker(TestCase): class RoutesChecker(TestCase):
@classmethod
def setUpClass(cls):
"""
Will be run *once* for all tests here
NEEDS TO HAVE TestCase.setUpClass()
"""
TestCase.setUpClass()
new_project = Node(
typename = 'PROJECT',
name = "hello i'm a project",
user_id = 1 # todo make sure it's the same user as login
)
session.add(new_project)
session.commit()
cls.a_node_id = new_project.id
print("created a project with id: %i" % new_project.id)
def setUp(self): def setUp(self):
""" """
Will be run before *each* test here Will be run before *each* test
""" """
self.client = Client() self.client = Client()
...@@ -46,6 +27,16 @@ class RoutesChecker(TestCase): ...@@ -46,6 +27,16 @@ class RoutesChecker(TestCase):
) )
# print(response.status_code) # expected: 302 FOUND # print(response.status_code) # expected: 302 FOUND
new_project = Node(
typename = 'PROJECT',
name = "hello i'm a project",
user_id = 1 # todo make sure it's the same user as login
)
session.add(new_project)
session.commit()
self.a_node_id = new_project.id
print("created a project with id: %i" % new_project.id)
def test_071a_get_front_page(self): def test_071a_get_front_page(self):
''' get the front page / ''' ''' get the front page / '''
front_response = self.client.get('/') front_response = self.client.get('/')
...@@ -78,7 +69,7 @@ class RoutesChecker(TestCase): ...@@ -78,7 +69,7 @@ class RoutesChecker(TestCase):
def test_073_get_api_one_node(self): def test_073_get_api_one_node(self):
''' get "api/nodes/<node_id>" ''' ''' get "api/nodes/<node_id>" '''
one_node_route = '/api/nodes/%i' % RoutesChecker.a_node_id one_node_route = '/api/nodes/%i' % self.a_node_id
# print("\ntesting node route: %s" % one_node_route) # print("\ntesting node route: %s" % one_node_route)
api_response = self.client.get(one_node_route) api_response = self.client.get(one_node_route)
self.assertTrue(api_response.has_header('Content-Type')) self.assertTrue(api_response.has_header('Content-Type'))
......
...@@ -11,19 +11,21 @@ from gargantext.util.db import session ...@@ -11,19 +11,21 @@ from gargantext.util.db import session
from gargantext.constants import RESOURCETYPES, NODETYPES, get_resource from gargantext.constants import RESOURCETYPES, NODETYPES, get_resource
from gargantext.util.toolchain.main import * from gargantext.util.toolchain.main import *
DATA_SAMPLE_DIR = "/srv/gargantext_lib/test_samples/" DATA_SAMPLE_DIR = "/srv/gargantext/unittests/mini_test_samples/"
# todo make it read NDOCS from a json overview to add in DATA_SAMPLE_DIR
DATA_SAMPLE_NDOCS = [ DATA_SAMPLE_NDOCS = [
None, # RESOURCETYPES None, # RESOURCETYPES
[50,4,50], # 1-europresse [7], # 1-europresse
[], # 2-jstor [], # 2-jstor
[81,81], # 3-pubmed [10], # 3-pubmed
[-1], # 4-scopus [], # 4-scopus
[-1], # 5-web_of_science [], # 5-web_of_science
[-1], # 6-zotero [12], # 6-zotero
[837,1000], #  7-csv [], #  7-csv
[-1], #  8-istex [32], #  8-istex
[3,10], # 9-scoap [], # 9-scoap
[-1], # 10-repec [], # 10-repec
] ]
...@@ -121,7 +123,7 @@ class ToolChainRecipes(TestCase): ...@@ -121,7 +123,7 @@ class ToolChainRecipes(TestCase):
self.log.debug("\t- Parsing and indexing corpus") self.log.debug("\t- Parsing and indexing corpus")
parse(self.corpus) parse(self.corpus)
real_ndocs = self.__count_node_children__(self.corpus, "DOCUMENT") real_ndocs = self.__count_node_children__(self.corpus, "DOCUMENT")
print('==>\t'+str(source_type)+'\t'+str(i)+'\t'+sample_file+'\t'+str(real_ndocs)) # print('==>\t'+str(source_type)+'\t'+str(i)+'\t'+sample_file+'\t'+str(real_ndocs))
self.assertEqual(real_ndocs, expected_ndocs) self.assertEqual(real_ndocs, expected_ndocs)
status = self.corpus.status() status = self.corpus.status()
self.log.debug("\t- Extracting ngrams") self.log.debug("\t- Extracting ngrams")
...@@ -137,29 +139,29 @@ class ToolChainRecipes(TestCase): ...@@ -137,29 +139,29 @@ class ToolChainRecipes(TestCase):
'''testing Europresse parsing''' '''testing Europresse parsing'''
self._run_recipe(1, DATA_SAMPLE_NDOCS[1]) self._run_recipe(1, DATA_SAMPLE_NDOCS[1])
def tests_002(self): # def tests_002_jstor(self):
self._run_recipe(2, DATA_SAMPLE_NDOCS[2]) # self._run_recipe(2, DATA_SAMPLE_NDOCS[2])
def tests_003(self): def tests_003_pubmed(self):
self._run_recipe(3, DATA_SAMPLE_NDOCS[3]) self._run_recipe(3, DATA_SAMPLE_NDOCS[3])
def tests_004(self): # def tests_004_scopus(self):
self._run_recipe(4, DATA_SAMPLE_NDOCS[4]) # self._run_recipe(4, DATA_SAMPLE_NDOCS[4])
#
def tests_005(self): # def tests_005_web_of_science(self):
self._run_recipe(5, DATA_SAMPLE_NDOCS[5]) # self._run_recipe(5, DATA_SAMPLE_NDOCS[5])
def tests_006(self): def tests_006_zotero(self):
self._run_recipe(6, DATA_SAMPLE_NDOCS[6]) self._run_recipe(6, DATA_SAMPLE_NDOCS[6])
def tests_007(self): # def tests_007_csv(self):
self._run_recipe(7, DATA_SAMPLE_NDOCS[7]) # self._run_recipe(7, DATA_SAMPLE_NDOCS[7])
def tests_008(self): def tests_008_istex(self):
self._run_recipe(8, DATA_SAMPLE_NDOCS[8]) self._run_recipe(8, DATA_SAMPLE_NDOCS[8])
def tests_009(self): # def tests_009_scoap(self):
self._run_recipe(9, DATA_SAMPLE_NDOCS[9]) # self._run_recipe(9, DATA_SAMPLE_NDOCS[9])
#
def tests_010(self): # def tests_010_repec(self):
self._run_recipe(10, DATA_SAMPLE_NDOCS[10]) # self._run_recipe(10, DATA_SAMPLE_NDOCS[10])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment