Commit 851b3afb authored by Romain Loth

unittests: factorize (same tests for different resources) + start adding array...

unittests: factorize (same tests for different resources) + start adding array of expected ndocs per test samples
parent 36712d23
......@@ -14,7 +14,21 @@ from gargantext.constants import RESOURCETYPES, NODETYPES, get_resource
#from unittests.framework import GargTestRunner
from gargantext.util.toolchain.main import *
DATA_SAMPLE_DIR = "/srv/gargantext_lib/data_samples/"
DATA_SAMPLE_DIR = "/srv/gargantext_lib/test_samples/"
# Expected number of documents for each sample file, indexed by resource type
# number (cf. RESOURCETYPES); inside each entry, position k is the k-th sample
# file of that resource.
# Index 0 is a None filler so that list indices line up with resource type ids.
# NOTE(review): -1 entries and the empty list look like placeholders for counts
# that have not been recorded yet -- TODO confirm.
DATA_SAMPLE_NDOCS = [
None, # 0: unused filler (RESOURCETYPES numbering starts at 1)
[50,4,50], # 1-europresse
[], # 2-jstor
[81,81], # 3-pubmed
[-1], # 4-scopus
[-1], # 5-web_of_science
[-1], # 6-zotero
[837,1000], # 7-csv
[-1], # 8-istex
[3,10], # 9-scoap
[-1], # 10-repec
]
class ToolChainRecipes(TestCase):
......@@ -47,8 +61,6 @@ class ToolChainRecipes(TestCase):
self.session.commit()
return self.project
def __count_node_children__(self, CurrNode, typename=None):
'''find ALL the children of a given Node [optionnal filter TYPENAME] '''
if typename is None:
......@@ -89,315 +101,76 @@ class ToolChainRecipes(TestCase):
def _get_corpus(self, name):
    """Look up a corpus node by name through the test session.

    Queries Node rows whose typename is "CORPUS" and whose name equals
    `name`, and returns whatever `.first()` yields for that query.
    """
    corpus_query = self.session.query(Node)
    by_name = corpus_query.filter(Node.typename == "CORPUS", Node.name == name)
    return by_name.first()
def test_000_post(self):
    """Request the page of the project held in self.project with the test client.

    Despite the method name, the request sent is a GET.
    """
    project_url = "/projects/%i" % self.project.id
    self.client.get(project_url)
# Setup-only variant of the Europresse test: it only builds and reports a label
# for the test and records the resource type number.
# NOTE(review): another definition named tests_001_europresse appears further
# down in this text; if both end up in the same class the later one wins.
def tests_001_europresse(self):
'''testing Europresse parsing'''
# label made of ">> ", the name of the running function and ":"
self.test_name = ">> "+ sys._getframe().f_code.co_name +":"
print("tests_001_europresse, name:", self.test_name)
self.log.debug("\n" + self.test_name)
# resource type number used for Europresse (labelled "1-europresse" in DATA_SAMPLE_NDOCS)
source_type = 1
def _run_recipe(self, source_type, expected_results):
    """
    Each of the resources input test can follow this common recipe base.

    For every sample file registered in self.sample_files under the
    normalised name of the resource:
      1. create a corpus and compare it with the copy read back from the DB,
      2. parse it and compare the number of DOCUMENT children with the
         expected count for that sample,
      3. extract ngrams,
    checking after steps 2 and 3 that the corpus status is "complete".

    @param source_type: int (cf. constants.py RESOURCETYPES)
    @param expected_results: []int (number of docs for each sample corpora of this source)
    """
    source = get_resource(source_type)
    # normalised resource name: text before "[", lower-cased, trimmed,
    # spaces replaced by underscores; used as key into self.sample_files
    source_name = source["name"].split("[")[0].lower().strip().replace(" ", "_")
    self.test_name = ">> "+ sys._getframe().f_code.co_name +"_"+str(source_name)+":"
    self.log.debug("\n" + self.test_name)

    # tolerate a missing table entry (e.g. None) as "nothing recorded"
    known_counts = expected_results or []

    for i, sample_file in enumerate(self.sample_files[source_name]):
        print("... sample_file:", sample_file)

        # NOTE(review): the table of expected counts is still being filled in;
        # a missing entry or a negative value is treated here as "no count
        # recorded yet" and the count assertion is skipped -- confirm that
        # -1 is indeed meant as such a placeholder.
        expected_ndocs = known_counts[i] if i < len(known_counts) else -1

        name = "test_"+source_name+str(i)
        self.log.debug("\t- Checking creation of corpus %s" %name)
        self.corpus = self._create_corpus(name, source_type, sample_file)
        db_corpus = self._get_corpus(name)

        # corpus check: the created corpus and the stored one must agree
        self.assertEqual(self.corpus.name, db_corpus.name)
        self.log.debug("\t- Checking creation of resource type '%s' " %get_resource(source_type)["name"])
        self.assertEqual(self.corpus.resources()[0]["type"], db_corpus.resources()[0]["type"])

        self.log.debug("\t- Parsing and indexing corpus")
        parse(self.corpus)
        real_ndocs = self.__count_node_children__(self.corpus, "DOCUMENT")
        # tab-separated trace line: resource type, sample index, file, doc count
        print('==>\t'+str(source_type)+'\t'+str(i)+'\t'+sample_file+'\t'+str(real_ndocs))
        if expected_ndocs >= 0:
            self.assertEqual(real_ndocs, expected_ndocs)
        else:
            self.log.debug("\t- No expected number of docs recorded for %s, count not checked" %sample_file)
        status = self.corpus.status()
        self.assertTrue(status["complete"])

        self.log.debug("\t- Extracting ngrams")
        extract_ngrams(self.corpus)
        status = self.corpus.status()
        self.assertTrue(status["complete"])
# Setup-only fragment for resource type 3: resolves the resource, derives its
# normalised name and logs a label for the test.
# NOTE(review): another definition named tests_003 appears further down in this
# text; if both end up in the same class the later one wins.
def tests_003(self):
# resource type number and its description as returned by get_resource
source_type = 3
source = get_resource(3)
# normalised name: text before "[", lower-cased, trimmed, spaces -> underscores
source_name = source["name"].split("[")[0].lower().strip().replace(" ", "_")
# label made of ">> ", the running function's name, "_", the source name and ":"
self.test_name = ">> "+ sys._getframe().f_code.co_name +"_"+str(source_name)+":"
self.log.debug("\n" + self.test_name)
def test_000_get_project(self):
    """Send a GET request for the page of the project held in self.project."""
    # The per-sample loop that used to follow relied on names (source_name,
    # source_type) that are not defined in this method; that work belongs to
    # _run_recipe and is not repeated here.
    self.client.get("/projects/%i" %self.project.id)
# Setup-only fragment for resource type 4: resolves the resource, derives its
# normalised name and logs a label for the test.
# NOTE(review): another definition named tests_004 appears further down in this
# text; if both end up in the same class the later one wins.
def tests_004(self):
# resource type number and its description as returned by get_resource
source_type = 4
source = get_resource(4)
# normalised name: text before "[", lower-cased, trimmed, spaces -> underscores
source_name = source["name"].split("[")[0].lower().strip().replace(" ", "_")
# label made of ">> ", the running function's name, "_", the source name and ":"
self.test_name = ">> "+ sys._getframe().f_code.co_name +"_"+str(source_name)+":"
self.log.debug("\n" + self.test_name)
def tests_001_europresse(self):
    """Run the shared recipe on the Europresse samples (resource type 1)."""
    europresse_type = 1
    self._run_recipe(europresse_type, DATA_SAMPLE_NDOCS[europresse_type])
def tests_002(self):
    """Run the shared recipe for resource type 2 (labelled "2-jstor" in DATA_SAMPLE_NDOCS)."""
    resource_type = 2
    self._run_recipe(resource_type, DATA_SAMPLE_NDOCS[resource_type])
def tests_003(self):
    """Run the shared recipe for resource type 3 (labelled "3-pubmed" in DATA_SAMPLE_NDOCS)."""
    # Everything per-sample (corpus creation, parsing, checks) is done by
    # _run_recipe; no inline copy of that loop is kept here.
    self._run_recipe(3, DATA_SAMPLE_NDOCS[3])
def tests_004(self):
    """Run the shared recipe for resource type 4 (labelled "4-scopus" in DATA_SAMPLE_NDOCS)."""
    # The assertions that used to follow referenced names (db_corpus,
    # source_type) never defined in this method and a fixed count of 50;
    # _run_recipe performs those checks with the table's expected counts.
    self._run_recipe(4, DATA_SAMPLE_NDOCS[4])
def tests_005(self):
    """Run the shared recipe for resource type 5 (labelled "5-web_of_science" in DATA_SAMPLE_NDOCS)."""
    # Delegates fully to _run_recipe: running an inline copy of the recipe
    # as well would process every sample twice and compare against a fixed
    # count of 50 instead of the table's values.
    self._run_recipe(5, DATA_SAMPLE_NDOCS[5])
def tests_006(self):
    """Run the shared recipe for resource type 6 (labelled "6-zotero" in DATA_SAMPLE_NDOCS)."""
    # Delegates fully to _run_recipe: running an inline copy of the recipe
    # as well would process every sample twice and compare against a fixed
    # count of 50 instead of the table's values.
    self._run_recipe(6, DATA_SAMPLE_NDOCS[6])
def tests_007(self):
    """Run the shared recipe for resource type 7 (labelled "7-csv" in DATA_SAMPLE_NDOCS)."""
    # Delegates fully to _run_recipe: an inline copy asserting 837 documents
    # for every sample would contradict the per-sample counts of the table
    # ([837, 1000]).
    self._run_recipe(7, DATA_SAMPLE_NDOCS[7])
def tests_008(self):
    """Run the shared recipe for resource type 8 (labelled "8-istex" in DATA_SAMPLE_NDOCS)."""
    # Delegates fully to _run_recipe: running an inline copy of the recipe
    # as well would process every sample twice and compare against a fixed
    # count of 50 instead of the table's values.
    self._run_recipe(8, DATA_SAMPLE_NDOCS[8])
def tests_009(self):
    """Run the shared recipe for resource type 9 (labelled "9-scoap" in DATA_SAMPLE_NDOCS)."""
    # Delegates fully to _run_recipe: an inline copy asserting 10 documents
    # for every sample would contradict the per-sample counts of the table
    # ([3, 10]).
    self._run_recipe(9, DATA_SAMPLE_NDOCS[9])
def tests_010(self):
    """Run the shared recipe for resource type 10 (labelled "10-repec" in DATA_SAMPLE_NDOCS)."""
    # Delegates fully to _run_recipe: running an inline copy of the recipe
    # as well would process every sample twice and compare against a fixed
    # count of 50 instead of the table's values.
    self._run_recipe(10, DATA_SAMPLE_NDOCS[10])
if __name__ == "__main__":
logging.basicConfig( stream=sys.stderr )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment