Commit b5fe01d8 authored by Marie FU's avatar Marie FU

Enable users to convert all files from a directory via the API, plus minor corrections for the Streamlit app

parent 369bdd93
......@@ -49,3 +49,6 @@ en,correct_file,"The given file is correct."
fr,new_file,"Télécharger le fichier TSV :"
en,new_file,"Download the TSV file: "
fr,err_detect_encoding,"L'encodage n'a pas pu être détecté"
en,err_detect_encoding,"Encoding could not be detected"
......@@ -6,14 +6,16 @@ Marie FU
from io import StringIO
import typing
import chardet
import streamlit as st
import csv
import re
import codecs
import src.basic as tmp
from streamlit.errors import StreamlitAPIException
# Define constant for file encoding supported (name in lower case)
FILE_ENCODING = "utf-8"
FILE_ENCODING = ["utf-8", "utf-8-sig", "ascii"]
# Define constant for GarganText TSV columns
TSV_COLUMNS = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
......@@ -35,15 +37,19 @@ def checkEncoding() -> bool:
(Boolean) : True if encoded correctly, False otherwise
Authors:
Nicolas Atrax
Marie FU
"""
content = STATE.file.read()
try:
content.decode(FILE_ENCODING)
STATE.encoding = chardet.detect(content)["encoding"]
if STATE.encoding is not None:
if STATE.encoding.lower() not in FILE_ENCODING:
STATE.errMessageLog += st.session_state.general_text_dict['err_file_encoding']
return False
return True
except UnicodeDecodeError:
STATE.errMessageLog += st.session_state.general_text_dict['err_file_encoding']
else:
STATE.errMessageLog += st.session_state.general_text_dict['err_detect_encoding']
return False
......@@ -64,7 +70,7 @@ def getSeparator() -> typing.Union[str,None]:
"""
toStartOfFile()
line = STATE.file.readline().decode('utf-8')
line = STATE.file.readline().decode(STATE.encoding)
if ',' in line:
if '\t' in line or ';' in line:
......@@ -246,7 +252,7 @@ def getColumnsNames(separator : str) -> typing.Union[bool,None]:
"""
registeredNames = []
line = StringIO(STATE.file.getvalue().decode("utf-8")).read().split("\n")[0].split(separator)
line = StringIO(STATE.file.getvalue().decode(STATE.encoding)).read().split("\n")[0].split(separator)
othersColumns = []
for name in line:
registeredNames, otherColumns = checkColumnNames(name,registeredNames, othersColumns)
......@@ -347,11 +353,11 @@ def getContent(separator : str) -> None:
csvLine = 2
reader = csv.DictReader(codecs.iterdecode(STATE.file, 'utf-8'), delimiter=separator)
reader = csv.DictReader(codecs.iterdecode(STATE.file, STATE.encoding), delimiter=separator)
for row in reader:
for name, value in row.items():
if name in STATE.columnMap.keys() and value is not None:
value = value.replace("\"","”")
value = value.replace("\"","”").replace("\n"," ")
checkMissing(lowerName(name), value, csvLine)
csvLine += 1
STATE.fileData = STATE.fileData[:-1] + "\n"
......@@ -376,7 +382,11 @@ def show_download_button() -> None:
Marie FU
"""
st.download_button(STATE.newFileName, STATE.fileData, STATE.newFileName)
try:
st.download_button(STATE.newFileName, STATE.fileData, STATE.newFileName)
except StreamlitAPIException:
st.write("duplicate file")
print("here")
return
......@@ -411,7 +421,7 @@ def checkNewFileName() -> bool :
STATE.errMessageLog += st.session_state.general_text_dict['err_file_name']
return False
else :
STATE.newFileName = "File.tsv"
STATE.newFileName = STATE.file.name.split(".")[0] + ".tsv"
return True
@st.experimental_fragment
......@@ -430,43 +440,44 @@ st.write(st.session_state.general_text_dict['description'])
STATE.newFileName = st.text_input(label=st.session_state.general_text_dict['file_name_input'],placeholder="File.tsv")
st.file_uploader(st.session_state.general_text_dict['file'], type=["tsv", "csv"], key="file")
st.file_uploader(st.session_state.general_text_dict['file'], type=["tsv", "csv"], key="file_",accept_multiple_files=True)
# checking if a file is uploaded
if STATE.file is not None:
# first utilisation of errMessageLog, contain the log of every error message encountered
STATE.errMessageLog = ""
if not checkNewFileName() or not checkEncoding() or getSeparator() is None:
errDisplay()
else :
# set file pointer to start of file, will be reset each time between each file operation
toStartOfFile()
separator = getSeparator()
toStartOfFile()
STATE.columnMap = {}
if getColumnsNames(separator) : # type: ignore
if STATE.file_ is not None:
for f in STATE.file_:
STATE.file = f
STATE.newFileName = None
# first utilisation of errMessageLog, contain the log of every error message encountered
STATE.errMessageLog = ""
if not checkNewFileName() or not checkEncoding() or getSeparator() is None:
errDisplay()
else :
# set file pointer to start of file, will be reset each time between each file operation
toStartOfFile()
STATE.fileData = ""
addColumnsNamestoTSV()
STATE.contentProblem = False
getContent(separator) # type: ignore
separator = getSeparator()
toStartOfFile()
if not STATE.contentProblem:
STATE.content = ""
STATE.content = STATE.file.read().decode('utf-8')
if STATE.content == STATE.fileData :
st.write(STATE.general_text_dict['correct_file'])
STATE.columnMap = {}
if getColumnsNames(separator) : # type: ignore
toStartOfFile()
STATE.fileData = ""
addColumnsNamestoTSV()
STATE.contentProblem = False
getContent(separator) # type: ignore
toStartOfFile()
if not STATE.contentProblem:
STATE.content = ""
STATE.content = STATE.file.read().decode(STATE.encoding)
if STATE.content == STATE.fileData :
st.write(STATE.general_text_dict['correct_file'])
else:
st.write(st.session_state.general_text_dict['new_file'])
show_download_button()
else:
st.write(st.session_state.general_text_dict['new_file'])
show_download_button()
else:
errDisplay()
# STATE.file.close()
errDisplay()
......@@ -7,10 +7,8 @@ import shutil
import zipfile
import tempfile
import os
import codecs
from datetime import date
import re
import chardet
import pandas as pd
import streamlit as st
import lib.tika.tika as tika
......@@ -178,14 +176,14 @@ def getInfo():
def pdfToTSV(fileName, fileAddress, pdfDir):
st.session_state.page = 1
author, title, watermark = getInfo()
tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
tsv = "Authors\tSource\tPublication Year\tpublication Month\tPublication_day\tTitle\tAbstract\n"
with st.spinner(st.session_state.general_text_dict['loading']):
tsv = segmentAbstract(fileName, fileAddress, tsv, author, title,
str(date.today().year), "1", "1", watermark)
if '/' in fileName:
fileName = fileName.split('/')[1]
with open(pdfDir + "/" + fileName.replace(".pdf", "(pdf).tsv"), "w", encoding="utf-8-sig") as file:
with open(pdfDir + "/" + fileName.replace(".pdf", "(pdf).tsv"), "w") as file:
file.write(tsv)
tsv = "\n".join(tsv.split("\n")[1:])
......@@ -271,7 +269,7 @@ if st.session_state.page == 0:
if st.session_state.file != None:
st.session_state.zipDir = tempfile.TemporaryDirectory()
st.session_state.pdfDir = tempfile.TemporaryDirectory()
st.session_state.tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract"
st.session_state.tsv = "Authors\tSource\tPublication Year\tpublication Month\tPublication_day\tTitle\tAbstract\n"
st.session_state.page = 1
extractAllPDF(st.session_state.zipDir.name, st.session_state.file)
st.session_state.len = len(
......@@ -302,7 +300,7 @@ if st.session_state.page == 1:
askPDF(fileName)
if st.session_state.page == 3:
with open(st.session_state.pdfDir.name + "/PDFCompilation.tsv", "w", encoding='utf-8-sig') as file:
with open(st.session_state.pdfDir.name + "/PDFCompilation.tsv", "w") as file:
file.write(st.session_state.tsv)
shutil.make_archive(st.session_state.zipDir.name +
"/PDFCompilation", 'zip', st.session_state.pdfDir.name)
......
import glob
import os
from pathlib import Path
from bs4 import BeautifulSoup
from flask import Flask, request
from grobid_client.grobid_client import GrobidClient
from numpy import number
app = Flask(__name__)
def requestGC() -> None:
......@@ -84,7 +87,10 @@ def getDate(soup : BeautifulSoup) -> str:
date = soup.find('date')
if date is not None:
if date is str:
res = date['when']
try:
res = date['when']
except TypeError:
res = "1\t1\t1"
else:
res = "1\t1\t1"
if "-" in res:
......@@ -92,9 +98,9 @@ def getDate(soup : BeautifulSoup) -> str:
dateTab = res.split("\t")
if len(dateTab) == 2 :
res += "1"
res += "\t1"
elif len(dateTab) == 1 :
res += "1\t1"
res += "\t1\t1"
return res
......@@ -146,6 +152,11 @@ def getAuthors(soup : BeautifulSoup) -> str:
return authors.strip()
def empty_content(pattern: str = "./book/file*") -> None:
    """Delete every file matching *pattern*.

    By default this removes the temporary working files
    (``./book/file*``) produced while converting an uploaded book,
    so the next request starts from a clean state.

    Args:
        pattern: Glob pattern of the files to delete. Defaults to the
            historical hard-coded value, so existing callers are
            unaffected.
    """
    for path in glob.glob(pattern):
        os.remove(path)
def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -> str:
"""
......@@ -181,7 +192,7 @@ def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -
# Loop in paragraphs
for paragraph in paragraphList:
fileData += date + "\t"
fileData += date + "\t" + authors + "\t"
if source is not None and title is not None:
fileData += title + ", " + source + "\t"
elif source is None and title is not None:
......@@ -209,29 +220,67 @@ def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -
fileData += "Title\tAbstract"
fileData += "\n"
fileData = "Publication Year\tPublication Month\tPublication Day\tSource\tTitle\tAbstract\tAuthors\n" + fileData.replace("\n","\t" + authors+"\n")
return fileData
@app.route("/", methods=['POST'])
def getBookTSV() -> str:
if request.method == 'POST':
# check if the post request has the file part
if 'file' not in request.files:
return "no file\n"
def checkXMLFile() -> bool:
    """Report whether the intermediate PDF (./book/file.pdf) is present.

    Returns:
        True when ``./book/file.pdf`` exists; otherwise the leftover
        working files are removed via ``empty_content()`` and False is
        returned.
    """
    pdf_present = os.path.exists("./book/file.pdf")
    if not pdf_present:
        # Conversion failed part-way: drop any partial artefacts.
        empty_content()
    return pdf_present
def checkContent(data: str) -> str:
    """Normalise extracted TSV content.

    Lines whose first character is not a digit are treated as
    continuations and merged into the preceding record; blank lines are
    dropped. Straight double quotes are replaced by typographic quotes
    (”) in the final result.

    Args:
        data: Raw newline-separated TSV content.

    Returns:
        The cleaned content, one record per line, each terminated by a
        newline (empty string when *data* has no non-blank lines).
    """
    records: list[str] = []
    for line in data.split("\n"):
        if not line:
            continue
        if records and not line[0].isnumeric():
            # Continuation: glue onto the previous record.
            records[-1] += line
        else:
            records.append(line)
    cleaned = "\n".join(records) + "\n" if records else ""
    return cleaned.replace('"', "”")
def create_app():
app = Flask(__name__)
@app.route("/getFile", methods=['POST'])
def getBookTSV() -> str:
if request.method == 'POST':
# check if the post request has the file part
if 'dir' not in request.form:
return "no directory path, curl needs to be like -- curl -X POST -F dir='my_directory_path' url --\n"
elif not os.path.exists(request.form["dir"]):
print(request.form["dir"])
return "directory does not exist, directory path should be absolute\n"
else:
dirName = request.form["dir"]
files = Path(dirName).glob('*.pdf')
allData = ""
for file in files:
print(file.name)
with open("book/file.pdf","wb") as f:
f.write(file.read_bytes())
requestGC()
if not checkXMLFile():
return ""
soup = getXMLContent()
title = getBookTitle(soup)
date = getDate(soup)
file = request.files['file']
with open("book/file.pdf","wb") as f:
f.write(file.read())
requestGC()
authors = getAuthors(soup)
soup = getXMLContent()
allData += getData(soup, title, date, authors)
title = getBookTitle(soup)
empty_content()
date = getDate(soup)
return "Publication Year\tPublication Month\tPublication Day\tAuthors\tSource\tTitle\tAbstract\n" + checkContent(allData)
return ""
authors = getAuthors(soup)
return app
return getData(soup, title, date, authors)
return ""
create_app()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment