Commit bd7294b2 authored by delanoe

Merge remote-tracking branch 'origin/romain-testing-taggers-parsers' into testing

parents b98b816d 08fed238
@@ -7,21 +7,36 @@ class RISParser(Parser):
_begin = 6
_parameters = {
"ER": {"type": "delimiter"}, # the record delimiter
"TI": {"type": "hyperdata", "key": "title", "separator": " "},
"T1": {"type": "hyperdata", "key": "title", "separator": " "},
# "T1": variant of TI (if together only last will be kept)
"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
"AU": {"type": "hyperdata", "key": "authors", "separator": "\n"},
"JO": {"type": "hyperdata", "key": "journal"},
"T2": {"type": "hyperdata", "key": "journal"},
# "T2": variant of JO (if together only last will be kept)
"UR": {"type": "hyperdata", "key": "doi"},
# RIS format specifications: PY is not only year but YYYY/MM/DD with MM and DD optional
# cf. https://en.wikipedia.org/wiki/RIS_(file_format)
"PY": {"type": "hyperdata", "key": "publication_year"},
"PD": {"type": "hyperdata", "key": "publication_month"},
"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
"N1": {"type": "hyperdata", "key": "references", "separator": ", "}, # more like notes in reality
"LA": {"type": "hyperdata", "key": "language_iso2"},
"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
"WC": {"type": "hyperdata", "key": "fields"},
# TODO other interesting fields
# "KW" (keywords)
# "A1", "A2"... (variants of AU)
# "N2" (variant of AB)
# previously mentioned here but in fact not in RIS specifications
# "PD": {"type": "hyperdata", "key": "publication_month"},
# "WC": {"type": "hyperdata", "key": "fields"},
}
def parse(self, file):
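
The PY handling above defers to PY_values_decompose_and_save(), which is defined elsewhere in the parser module and does not appear in this diff. A minimal sketch of such a helper, assuming it only has to split the "YYYY/MM/DD" value (MM and DD optional) into separate hyperdata keys:

    def PY_values_decompose_and_save(py_value, hyperdata):
        # PY may be "YYYY", "YYYY/MM" or "YYYY/MM/DD" (cf. the RIS spec comment above)
        parts = py_value.strip().split('/')
        for key, part in zip(['publication_year', 'publication_month', 'publication_day'], parts):
            if part:
                hyperdata[key] = part
        return hyperdata
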
@@ -33,49 +48,66 @@ class RISParser(Parser):
for line in file:
# bytes ~~> str
line = line.decode("UTF-8").rstrip('\r\n')
# print("RIS line:", line)
if len(line) >= 2 :
# extract the parameter key...
parameter_key = line[:2]
# ...and keep the rest for when we know what to do with it
current_value = line[self._begin:]
# it's a new key => therefore the previous key is finished
if parameter_key != last_key:
if last_key in self._parameters:
# translate key
parameter = self._parameters[last_key]
# 1 - we record the previous value array...
if parameter["type"] == "hyperdata":
separator = parameter["separator"] if "separator" in parameter else ""
final_value = separator.join(last_values)
if last_key != 'PY':
hyperdata[parameter["key"]] = final_value
else:
hyperdata = PY_values_decompose_and_save(final_value, hyperdata)
#... or even finish the record (rare here, most often after empty line)
elif parameter["type"] == "delimiter":
if 'language_fullname' not in hyperdata.keys():
if 'language_iso3' not in hyperdata.keys():
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
last_key = None
hyperdata = {}
# 2 - new key: also we start a new value array and move on to the next key
last_values = []
last_key = parameter_key
# 3 - new key or old: in any case we pass contents to
# print("(nonemptyline)")
# test if key line (otherwise: continuation line)
if match(r'[A-Z][A-Z0-9]\s', line):
parameter_key = line[:2]
# print("(matchparamline:"+parameter_key+")")
# we can now be sure that the value is rest of the line
# (keep it for when we know what to do with it)
current_value = line[self._begin:]
# it's a new key => therefore the previous key is finished
if parameter_key != last_key:
if last_key in self._parameters:
# translate key
parameter = self._parameters[last_key]
# 1 - we record the previous value array...
if parameter["type"] == "hyperdata":
separator = parameter["separator"] if "separator" in parameter else ""
final_value = separator.join(last_values)
if last_key != 'PY':
hyperdata[parameter["key"]] = final_value
else:
hyperdata = PY_values_decompose_and_save(final_value, hyperdata)
# print("{saved previous"+last_key+"}")
#... or even finish the record (rare here, most often after empty line)
elif parameter["type"] == "delimiter":
if 'language_fullname' not in hyperdata.keys():
if 'language_iso3' not in hyperdata.keys():
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
# print("{saved previous record}")
last_key = None
hyperdata = {}
# 2 - new key: also we start a new value array and move on to the next key
last_values = []
last_key = parameter_key
# continuation line: values start from position 0
else:
current_value = line
# print("(continuationline)")
# 3 - new key or old or no key: in any case we pass contents to
# the value array buffer (=> for the next loop only)
last_values.append(current_value)
current_value = None
# empty line => we need to check if PREVIOUS LINE was record delimiter
else:
# print("(emptyline)")
if last_key in self._parameters:
if parameter["type"] == "delimiter":
if 'language_fullname' not in hyperdata.keys():
@@ -83,6 +115,7 @@ class RISParser(Parser):
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
# print("{saved previous record}")
last_key = None
hyperdata = {}
# [end of loop per lines]
@@ -97,6 +130,7 @@ class RISParser(Parser):
hyperdata[parameter["key"]] = final_value
else:
hyperdata = PY_values_decompose_and_save(final_value, hyperdata)
# print("{saved previous"+last_key+"}")
# if a hyperdata object is left in memory, yield it as well
if hyperdata:
@@ -105,8 +139,7 @@ class RISParser(Parser):
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
# print("{saved previous record}")
@@ -14,28 +14,9 @@ from gargantext.constants import NODETYPES
from gargantext.util.db import session
class RoutesChecker(TestCase):
@classmethod
def setUpClass(cls):
"""
Will be run *once* for all tests here
NEEDS TO HAVE TestCase.setUpClass()
"""
TestCase.setUpClass()
new_project = Node(
typename = 'PROJECT',
name = "hello i'm a project",
user_id = 1 # todo make sure it's the same user as login
)
session.add(new_project)
session.commit()
cls.a_node_id = new_project.id
print("created a project with id: %i" % new_project.id)
def setUp(self):
"""
Will be run before *each* test here
Will be run before *each* test
"""
self.client = Client()
@@ -46,6 +27,16 @@ class RoutesChecker(TestCase):
)
# print(response.status_code) # expected: 302 FOUND
new_project = Node(
typename = 'PROJECT',
name = "hello i'm a project",
user_id = 1 # todo make sure it's the same user as login
)
session.add(new_project)
session.commit()
self.a_node_id = new_project.id
print("created a project with id: %i" % new_project.id)
def test_071a_get_front_page(self):
''' get the front page / '''
front_response = self.client.get('/')
@@ -78,7 +69,7 @@ class RoutesChecker(TestCase):
def test_073_get_api_one_node(self):
''' get "api/nodes/<node_id>" '''
one_node_route = '/api/nodes/%i' % RoutesChecker.a_node_id
one_node_route = '/api/nodes/%i' % self.a_node_id
# print("\ntesting node route: %s" % one_node_route)
api_response = self.client.get(one_node_route)
self.assertTrue(api_response.has_header('Content-Type'))
@@ -11,19 +11,21 @@ from gargantext.util.db import session
from gargantext.constants import RESOURCETYPES, NODETYPES, get_resource
from gargantext.util.toolchain.main import *
DATA_SAMPLE_DIR = "/srv/gargantext_lib/test_samples/"
DATA_SAMPLE_DIR = "/srv/gargantext/unittests/mini_test_samples/"
# todo make it read NDOCS from a json overview to add in DATA_SAMPLE_DIR
DATA_SAMPLE_NDOCS = [
None, # RESOURCETYPES
[50,4,50], # 1-europresse
[7], # 1-europresse
[], # 2-jstor
[81,81], # 3-pubmed
[-1], # 4-scopus
[-1], # 5-web_of_science
[-1], # 6-zotero
[837,1000], #  7-csv
[-1], #  8-istex
[3,10], # 9-scoap
[-1], # 10-repec
[10], # 3-pubmed
[], # 4-scopus
[], # 5-web_of_science
[12], # 6-zotero
[], #  7-csv
[32], #  8-istex
[], # 9-scoap
[], # 10-repec
]
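
The todo above suggests reading these expected counts from a JSON overview placed alongside the samples instead of hard-coding the list; a minimal sketch, assuming a hypothetical overview.json in DATA_SAMPLE_DIR that maps resource-type indices to expected document counts:

    import json
    import os

    def load_expected_ndocs(sample_dir):
        # hypothetical overview.json content: {"1": [7], "3": [10], "6": [12], "8": [32]}
        with open(os.path.join(sample_dir, "overview.json")) as f:
            overview = json.load(f)
        ndocs = [None] + [[] for _ in range(10)]  # same shape as DATA_SAMPLE_NDOCS
        for index, counts in overview.items():
            ndocs[int(index)] = counts
        return ndocs
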
@@ -121,7 +123,7 @@ class ToolChainRecipes(TestCase):
self.log.debug("\t- Parsing and indexing corpus")
parse(self.corpus)
real_ndocs = self.__count_node_children__(self.corpus, "DOCUMENT")
print('==>\t'+str(source_type)+'\t'+str(i)+'\t'+sample_file+'\t'+str(real_ndocs))
# print('==>\t'+str(source_type)+'\t'+str(i)+'\t'+sample_file+'\t'+str(real_ndocs))
self.assertEqual(real_ndocs, expected_ndocs)
status = self.corpus.status()
self.log.debug("\t- Extracting ngrams")
@@ -137,29 +139,29 @@ class ToolChainRecipes(TestCase):
'''testing Europresse parsing'''
self._run_recipe(1, DATA_SAMPLE_NDOCS[1])
def tests_002(self):
self._run_recipe(2, DATA_SAMPLE_NDOCS[2])
# def tests_002_jstor(self):
# self._run_recipe(2, DATA_SAMPLE_NDOCS[2])
def tests_003(self):
def tests_003_pubmed(self):
self._run_recipe(3, DATA_SAMPLE_NDOCS[3])
def tests_004(self):
self._run_recipe(4, DATA_SAMPLE_NDOCS[4])
def tests_005(self):
self._run_recipe(5, DATA_SAMPLE_NDOCS[5])
# def tests_004_scopus(self):
# self._run_recipe(4, DATA_SAMPLE_NDOCS[4])
#
# def tests_005_web_of_science(self):
# self._run_recipe(5, DATA_SAMPLE_NDOCS[5])
def tests_006(self):
def tests_006_zotero(self):
self._run_recipe(6, DATA_SAMPLE_NDOCS[6])
def tests_007(self):
self._run_recipe(7, DATA_SAMPLE_NDOCS[7])
# def tests_007_csv(self):
# self._run_recipe(7, DATA_SAMPLE_NDOCS[7])
def tests_008(self):
def tests_008_istex(self):
self._run_recipe(8, DATA_SAMPLE_NDOCS[8])
def tests_009(self):
self._run_recipe(9, DATA_SAMPLE_NDOCS[9])
def tests_010(self):
self._run_recipe(10, DATA_SAMPLE_NDOCS[10])
# def tests_009_scoap(self):
# self._run_recipe(9, DATA_SAMPLE_NDOCS[9])
#
# def tests_010_repec(self):
# self._run_recipe(10, DATA_SAMPLE_NDOCS[10])