Commit cfc1e7cb authored by Marie FU

add conditions to the POST method for validating TSV contexts, along with tests and an OAS update

parent 313b7fcd
import json

from flask import Blueprint, request

-from gargantools.utils.utils import check_fileContent, check_fileExtension, check_fileEncoding
+from gargantools.utils.contexts_utils import check_fileContent
+from gargantools.utils.utils import check_columnName, check_fileExtension, check_fileEncoding, get_fileContent, get_fileDelimiter

+COLUMN_NAMES = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source"]

bp = Blueprint("contexts", __name__, url_prefix="/contexts")


@bp.get('')
def tsvTemplate():
    return json.dumps({"Publication Day": 1, "Publication Month": 1, "Publication Year": 1, "Authors": "Some authors", "Title": "A title", "Abstract": "An abstract", "Source": "Some Source"}, indent=3)
@bp.post('')
-def tsvValidate():
+def tsvValidation():
    if 'file' not in request.files:
        print(request.files)
        return "Bad request, missing file\n", 400
    else:
        file = request.files['file']
@@ -24,13 +25,25 @@ def tsvValidate():
        file_encoding = check_fileEncoding(file)
        if file_encoding is None:
            return "Could not read the file\n", 422
+        elif get_fileDelimiter(file, file_encoding) != '\t':
+            return "File delimiter not found or Incorrect file delimiter, should be a tabulation\n", 422
        else:
-            state, problems = check_fileContent(file, file_encoding)
-            if state:
-                return "Correct file\n", 200
+            file_content = get_fileContent(file, '\t')
+            if file_content is None:
+                print('here')
+                return "Could not read the file\n", 422
            else:
-                if problems is not None:
-                    return 'Incorrect file - File is not compatible with GarganText\n', 422
+                header = file_content.keys()
+                state, notFoundColumn = check_columnName(header, COLUMN_NAMES)
+                if not state:
+                    return f"Some column names were not found {','.join(notFoundColumn)}\n", 422
                else:
-                    return 'Incorrect file - File is malformed\n', 422
\ No newline at end of file
+                    state, problems = check_fileContent(file_content)
+                    if state:
+                        return "Correct file\n", 200
+                    else:
+                        if problems is not None:
+                            return 'Incorrect file - File is not compatible with GarganText\n', 422
+                        else:
+                            return 'Unexpected error in file\n', 422
\ No newline at end of file
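For context, the sketch below shows how a client could exercise the new POST /contexts validation route. It is illustrative only: the base URL, the file name, and the use of the requests library are assumptions, not part of the commit (note that the repository's correct.csv fixture is tab-delimited despite its extension).

# Illustrative client sketch (assumption: the Flask app is served locally on port 5000).
import requests

with open("correct.csv", "rb") as f:
    response = requests.post(
        "http://localhost:5000/contexts",
        files={"file": ("correct.csv", f, "text/tab-separated-values")},
    )

# Expect 200 and "Correct file" for a valid tab-separated file, otherwise
# 400/422 with one of the error messages listed in the OAS example below.
print(response.status_code, response.text)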
import json

-from flask import Blueprint
+from flask import Blueprint, request
+from gargantools.utils.utils import check_fileEncoding, check_fileExtension

bp = Blueprint("terms", __name__, url_prefix="/terms")
@@ -7,3 +9,16 @@ bp = Blueprint("terms", __name__, url_prefix="/terms")

@bp.get('')
def termsTemplate():
    return json.dumps({"status": "MapTerm", "label": "A term"}, indent=3)

+@bp.post('')
+def termsValidation():
+    if 'file' not in request.files:
+        return "Bad request, missing file\n", 400
+    else:
+        file = request.files['file']
+        if not (file and check_fileExtension(file.filename, {"csv", "tsv"})):
+            return "Incorrect file format or file format not found\n", 400
+        else:
+            file_encoding = check_fileEncoding(file)
+            if file_encoding is None:
+                return "Could not read the file\n", 422
\ No newline at end of file
import csv

import pandas as pd
import petl

from gargantools.utils.utils import check_date, check_unacceptedCharactersQuote, check_unacceptedCharactersTab


def check_fileContent(fileContent):
    header = ("Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source")
    constraints = [
        dict(
            name = "pub_day", field = "Publication Day", test = int, assertion = lambda x: 0 < x <= 31
        ),
        dict(
            name = "pub_month", field = "Publication Month", test = int, assertion = lambda x : 0 < x <= 12
        ),
        dict(
            name = "pub_year", field = "Publication Year", test = int, assertion = lambda x: x > 0
        ),
        dict(
            name = "date_format", field = ["Publication Day", "Publication Month", "Publication Year"], assertion = lambda row : check_date(row[2], row[1], row[0])
        ),
        dict(
            name = "str_formatQuote", field = "Authors", assertion = lambda x : check_unacceptedCharactersQuote(x)
        ),
        dict(
            name = "str_formatTab", field = "Authors", assertion = lambda x : check_unacceptedCharactersTab(x)
        ),
        dict(
            name = "str_formatQuote", field = "Title", assertion = lambda x : check_unacceptedCharactersQuote(x)
        ),
        dict(
            name = "str_formatTab", field = "Title", assertion = lambda x : check_unacceptedCharactersTab(x)
        ),
        dict(
            name = "str_formatQuote", field = "Abstract", assertion = lambda x : check_unacceptedCharactersQuote(x)
        ),
        dict(
            name = "str_formatTab", field = "Abstract", assertion = lambda x : check_unacceptedCharactersTab(x)
        ),
        dict(
            name = "str_formatQuote", field = "Source", assertion = lambda x : check_unacceptedCharactersQuote(x)
        ),
        dict(
            name = "str_formatTab", field = "Source", assertion = lambda x : check_unacceptedCharactersTab(x)
        )
    ]
    dataTable = fileContent.values.tolist()
    dataTable.insert(0, fileContent.columns.to_list())
    problemCells = petl.validate(dataTable, constraints, header)
    if problemCells.len() > 1:
        return False, problemCells
    else:
        return True, None
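To make the return contract of the new check_fileContent helper concrete, here is a minimal sketch of calling it directly on an in-memory pandas DataFrame. The one-row frame below is an assumption chosen to satisfy every constraint; it is not part of the commit.

# Minimal sketch: check_fileContent operates on an already-parsed DataFrame,
# not on the uploaded file itself. The sample row is an illustrative assumption.
import pandas as pd

from gargantools.utils.contexts_utils import check_fileContent

df = pd.DataFrame([{
    "Publication Day": 1,
    "Publication Month": 1,
    "Publication Year": 2024,
    "Authors": "Some authors",
    "Title": "A title",
    "Abstract": "An abstract",
    "Source": "Some Source",
}])

state, problems = check_fileContent(df)
print(state)     # True when every petl constraint passes
print(problems)  # None on success, otherwise the petl table of failing cells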
import csv
import datetime
import chardet
import petl as etl
import pandas as pd
ALLOWED_ENCODING = {"utf-8", "utf-8-sig", "ascii"}
@@ -37,17 +35,30 @@ def check_fileEncoding(file):

def get_fileName(filename):
    return filename.rsplit('.', 1)[:-1][0]


-def check_columnName(list_columnNames):
+def get_fileDelimiter(file, file_encoding):
+    fileDelimiter = csv.Sniffer().sniff(file.read().decode(file_encoding)).delimiter
+    file.seek(0)
+    return fileDelimiter


+def get_fileContent(file, fileDelimiter):
+    try:
+        fileContent = pd.read_csv(file, sep=fileDelimiter)
+    except Exception:
+        return None
+    return fileContent


+def check_columnName(list_columnNames, correct_columnNames):
    if len(list_columnNames) > 7:
        return False, []
    else:
-        correct_columnNames = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source"]
-        notFound_columnNames = correct_columnNames
+        notFound_columnNames = correct_columnNames.copy()
        for colummnName in list_columnNames:
            if colummnName in correct_columnNames:
                notFound_columnNames.remove(colummnName)
        if len(notFound_columnNames) != 0:
            return False, notFound_columnNames
        else:
@@ -65,63 +76,4 @@ def check_unacceptedCharactersQuote(cell):

def check_unacceptedCharactersTab(cell):
    return (' ' in cell) == False

-def check_fileContent(file, file_encoding):
-    header = ("Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source")
-    constraints = [
-        dict(
-            name = "pub_day", field = "Publication Day", test = int, assertion = lambda x: 0 < x <= 31
-        ),
-        dict(
-            name = "pub_month", field = "Publication Month", test = int, assertion = lambda x : 0 < x <= 12
-        ),
-        dict(
-            name = "pub_year", field = "Publication Year", test = int, assertion = lambda x: x > 0
-        ),
-        dict(
-            name = "date_format", field = ["Publication Day", "Publication Month", "Publication Year"], assertion = lambda row : check_date(row[2], row[1], row[0])
-        ),
-        dict(
-            name = "str_formatQuote", field = "Authors", assertion = lambda x : check_unacceptedCharactersQuote(x)
-        ),
-        dict(
-            name = "str_formatTab", field = "Authors", assertion = lambda x : check_unacceptedCharactersTab(x)
-        ),
-        dict(
-            name = "str_formatQuote", field = "Title", assertion = lambda x : check_unacceptedCharactersQuote(x)
-        ),
-        dict(
-            name = "str_formatTab", field = "Title", assertion = lambda x : check_unacceptedCharactersTab(x)
-        ),
-        dict(
-            name = "str_formatQuote", field = "Abstract", assertion = lambda x : check_unacceptedCharactersQuote(x)
-        ),
-        dict(
-            name = "str_formatTab", field = "Abstract", assertion = lambda x : check_unacceptedCharactersTab(x)
-        ),
-        dict(
-            name = "str_formatQuote", field = "Source", assertion = lambda x : check_unacceptedCharactersQuote(x)
-        ),
-        dict(
-            name = "str_formatTab", field = "Source", assertion = lambda x : check_unacceptedCharactersTab(x)
-        )
-    ]
-    fileDelimiter = csv.Sniffer().sniff(file.read().decode(file_encoding)).delimiter
-    file.seek(0)
-    try:
-        fileContent = pd.read_csv(file, sep=fileDelimiter)
-    except Exception:
-        return False, None
-    dataTable = fileContent.values.tolist()
-    dataTable.insert(0, fileContent.columns.to_list())
-    problemCells = etl.validate(dataTable, constraints, header)
-    if problemCells.len() > 1:
-        return False, problemCells
-    else:
-        return True, None
\ No newline at end of file
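The sketch below chains the new utils helpers the same way the route does, on an in-memory TSV. It is illustrative only: BytesIO stands in for the uploaded file, and the sample data and local COLUMN_NAMES list are assumptions mirroring the ones used in contexts.py.

# Illustrative sketch of the new helper chain: sniff the delimiter, parse the
# file, then check the header. BytesIO stands in for Flask's uploaded file.
from io import BytesIO

from gargantools.utils.utils import check_columnName, get_fileContent, get_fileDelimiter

COLUMN_NAMES = ["Publication Day", "Publication Month", "Publication Year",
                "Authors", "Title", "Abstract", "Source"]

tsv = ("Publication Day\tPublication Month\tPublication Year\t"
       "Authors\tTitle\tAbstract\tSource\n"
       "1\t1\t2024\tan author\ta title\tan abstract\ta source\n")
file = BytesIO(tsv.encode("utf-8"))

delimiter = get_fileDelimiter(file, "utf-8")      # expected: "\t"
file_content = get_fileContent(file, delimiter)   # pandas DataFrame, or None on parse failure
state, missing = check_columnName(file_content.keys(), COLUMN_NAMES)
print(delimiter == "\t", state, missing)          # expected: True True []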
@@ -83,7 +83,9 @@ paths:
              example:
                - Could not read file
                - Incorrect file - File is not compatible with GarganText
-               - Incorrect file - File is malformed
+               - File delimiter not found or Incorrect file delimiter, should be a tabulation
+               - Some column names were not found ...
+               - Unexpected error in file
        '500':
          description: Unexpected Error
  /contexts/{from}:
......
from io import BytesIO

import pytest

from gargantools.utils.contexts_utils import check_fileContent
from gargantools.utils.utils import get_fileContent


@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
def test_check_fileContent(test_file_copy):
    with open(test_file_copy, "r") as f:
        f_data = BytesIO(f.read().encode("utf-8"))
    assert check_fileContent(get_fileContent(f_data, '\t')) == (True, None)
@@ -10,30 +10,31 @@ def test_missingFile(client):
    assert response.data == b"Bad request, missing file\n"
    assert response.status_code == 400


-@pytest.mark.parametrize("test_file_copy", ["incorrectExtention"], indirect=True)
-def test_fileExtensionError(client, test_file_copy):
-    with open(test_file_copy, "rb") as f:
-        f_data = BytesIO(f.read())
-    file_storage = FileStorage(f_data, filename=test_file_copy)
-    response = client.post('/contexts', data={'file': file_storage}, content_type='multipart/form-data')
-    assert response.data == b"Incorrect file format or file format not found\n"
-    assert response.status_code == 400


def test_fileEncodingError(client):
    # TODO: write test
    return


-@pytest.mark.parametrize("test_file_copy", ["incorrect.csv", "malformed.csv"], indirect=True)
+@pytest.mark.parametrize("test_file_copy", ["incorrect.csv", "malformed.csv", "incorrectDelimiter.csv", "incorrectHeader.csv", "incorrectExtention"], indirect=True)
def test_incorrectFile(client, test_file_copy):
    with open(test_file_copy, "rb") as f:
        f_data = BytesIO(f.read())
    file_storage = FileStorage(f_data, filename=test_file_copy)
    response = client.post('/contexts', data={'file': file_storage}, content_type='multipart/form-data')
-    assert response.status_code == 422
+    print(test_file_copy)
    if "incorrect.csv" in test_file_copy:
        assert response.data == b'Incorrect file - File is not compatible with GarganText\n'
    elif "malformed.csv" in test_file_copy:
        assert response.data == b"Could not read the file\n"
+    elif "incorrectDelimiter.csv" in test_file_copy:
+        assert response.data == b"File delimiter not found or Incorrect file delimiter, should be a tabulation\n"
+    elif "incorrectHeader.csv" in test_file_copy:
+        assert b"Some column names were not found" in response.data
+    elif "incorrectExtention" in test_file_copy:
+        assert response.data == b"Incorrect file format or file format not found\n"
+        assert response.status_code == 400
+    else:
+        assert False
+    if "incorrectExtention" not in test_file_copy:
+        assert response.status_code == 422


@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
def test_correctFile(client, test_file_copy):
......
Publication Day,Publication Month,Publication Year,Authors,Title,Abstract,Source
31,12,1,an author,a ,title,an abstract,a source
1,1,1,an author,"a """"title""""","an """"""""abstract""""""""",a source
Publication Month Publication Year Authors Title Abstract Source
12 1 an author a ,title an abstract a source
1 1 an author "a """"title""""" "an """"""""abstract""""""""" a source
from io import BytesIO
import os

import pytest

from gargantools.utils.contexts_utils import check_fileContent
from gargantools.utils.utils import *

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), 'test_files')


def test_check_fileExtension():
    list_trueFile = ['a.csv', 'a.tsv', 'a.pdf', 'a.html', 'a.txt', 'a.b.c.csv']
    list_falseFile = ['a', 'a.c']
@@ -33,26 +33,32 @@ def test_get_fileName():
    assert get_fileName("afilename.csv") == "afilename"
    assert get_fileName("afile.name.csv") == "afile.name"


+@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
+def test_get_fileDelimiter(test_file_copy):
+    with open(test_file_copy, "r") as f:
+        f_data = BytesIO(f.read().encode("utf-8"))
+    assert get_fileDelimiter(f_data, "utf-8") == "\t"


+@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
+def test_get_fileContent(test_file_copy):
+    with open(test_file_copy, "r") as f:
+        f_data = BytesIO(f.read().encode("utf-8"))
+    assert get_fileContent(f_data, "\t") is not None


def test_check_columnName():
+    correct_column = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source"]
    list_trueColumnName = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source"]
    list_falseColumnName = ["Publication day", "Publication Month", " Year", "Authors", "Title", "Abstract", "Source"]
    list_falseColumnName_more = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source", "azerty"]
    list_falseColumnName_less = ["Publication Day", "Publication Year", "Authors", "Title"]
-    assert check_columnName(list_trueColumnName) == (True, [])
-    assert check_columnName(list_falseColumnName) == (False, ["Publication Day", "Publication Year"])
-    assert check_columnName(list_falseColumnName_more) == (False, [])
-    assert check_columnName(list_falseColumnName_less) == (False, ["Publication Month", "Abstract", "Source"])
+    assert check_columnName(list_trueColumnName, correct_column) == (True, [])
+    assert check_columnName(list_falseColumnName, correct_column) == (False, ["Publication Day", "Publication Year"])
+    assert check_columnName(list_falseColumnName_more, correct_column) == (False, [])
+    assert check_columnName(list_falseColumnName_less, correct_column) == (False, ["Publication Month", "Abstract", "Source"])


def test_check_date():
    assert check_date(1,1,1)
    assert not check_date(0,1,1)
    assert not check_date(2000,2,30)
    assert not check_date(2000,13,1)


-@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
-def test_check_fileContent(test_file_copy):
-    with open(test_file_copy, "r") as f:
-        f_data = BytesIO(f.read().encode("utf-8"))
-    assert check_fileContent(f_data, "utf-8") == (True, None)