Commit b5fe01d8 authored by Marie FU's avatar Marie FU

Enable users to convert all files from a directory via the API, plus minor corrections for the Streamlit app

parent 369bdd93
......@@ -49,3 +49,6 @@ en,correct_file,"The given file is correct."
fr,new_file,"Télécharger le fichier TSV :"
en,new_file,"Download the TSV file: "
fr,err_detect_encoding,"L'encodage n'a pas pu être détecté"
en,err_detect_encoding,"Encoding could not be detected"
......@@ -6,14 +6,16 @@ Marie FU
from io import StringIO
import typing
import chardet
import streamlit as st
import csv
import re
import codecs
import src.basic as tmp
from streamlit.errors import StreamlitAPIException
# Define constant for file encoding supported (name in lower case)
FILE_ENCODING = "utf-8"
FILE_ENCODING = ["utf-8", "utf-8-sig", "ascii"]
# Define constant for GarganText TSV columns
TSV_COLUMNS = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
......@@ -35,15 +37,19 @@ def checkEncoding() -> bool:
(Boolean) : True if encoded correctly, False otherwise
Authors:
Nicolas Atrax
Marie FU
"""
content = STATE.file.read()
try:
content.decode(FILE_ENCODING)
STATE.encoding = chardet.detect(content)["encoding"]
if STATE.encoding is not None:
if STATE.encoding.lower() not in FILE_ENCODING:
STATE.errMessageLog += st.session_state.general_text_dict['err_file_encoding']
return False
return True
except UnicodeDecodeError:
STATE.errMessageLog += st.session_state.general_text_dict['err_file_encoding']
else:
STATE.errMessageLog += st.session_state.general_text_dict['err_detect_encoding']
return False
......@@ -64,7 +70,7 @@ def getSeparator() -> typing.Union[str,None]:
"""
toStartOfFile()
line = STATE.file.readline().decode('utf-8')
line = STATE.file.readline().decode(STATE.encoding)
if ',' in line:
if '\t' in line or ';' in line:
......@@ -246,7 +252,7 @@ def getColumnsNames(separator : str) -> typing.Union[bool,None]:
"""
registeredNames = []
line = StringIO(STATE.file.getvalue().decode("utf-8")).read().split("\n")[0].split(separator)
line = StringIO(STATE.file.getvalue().decode(STATE.encoding)).read().split("\n")[0].split(separator)
othersColumns = []
for name in line:
registeredNames, otherColumns = checkColumnNames(name,registeredNames, othersColumns)
......@@ -347,11 +353,11 @@ def getContent(separator : str) -> None:
csvLine = 2
reader = csv.DictReader(codecs.iterdecode(STATE.file, 'utf-8'), delimiter=separator)
reader = csv.DictReader(codecs.iterdecode(STATE.file, STATE.encoding), delimiter=separator)
for row in reader:
for name, value in row.items():
if name in STATE.columnMap.keys() and value is not None:
value = value.replace("\"","”")
value = value.replace("\"","”").replace("\n"," ")
checkMissing(lowerName(name), value, csvLine)
csvLine += 1
STATE.fileData = STATE.fileData[:-1] + "\n"
......@@ -376,7 +382,11 @@ def show_download_button() -> None:
Marie FU
"""
st.download_button(STATE.newFileName, STATE.fileData, STATE.newFileName)
try:
st.download_button(STATE.newFileName, STATE.fileData, STATE.newFileName)
except StreamlitAPIException:
st.write("duplicate file")
print("here")
return
......@@ -411,7 +421,7 @@ def checkNewFileName() -> bool :
STATE.errMessageLog += st.session_state.general_text_dict['err_file_name']
return False
else :
STATE.newFileName = "File.tsv"
STATE.newFileName = STATE.file.name.split(".")[0] + ".tsv"
return True
@st.experimental_fragment
......@@ -430,43 +440,44 @@ st.write(st.session_state.general_text_dict['description'])
STATE.newFileName = st.text_input(label=st.session_state.general_text_dict['file_name_input'],placeholder="File.tsv")
st.file_uploader(st.session_state.general_text_dict['file'], type=["tsv", "csv"], key="file")
st.file_uploader(st.session_state.general_text_dict['file'], type=["tsv", "csv"], key="file_",accept_multiple_files=True)
# checking if a file is uploaded
if STATE.file is not None:
# first utilisation of errMessageLog, contain the log of every error message encountered
STATE.errMessageLog = ""
if not checkNewFileName() or not checkEncoding() or getSeparator() is None:
errDisplay()
else :
# set file pointer to start of file, will be reset each time between each file operation
toStartOfFile()
separator = getSeparator()
toStartOfFile()
STATE.columnMap = {}
if getColumnsNames(separator) : # type: ignore
if STATE.file_ is not None:
for f in STATE.file_:
STATE.file = f
STATE.newFileName = None
# first utilisation of errMessageLog, contain the log of every error message encountered
STATE.errMessageLog = ""
if not checkNewFileName() or not checkEncoding() or getSeparator() is None:
errDisplay()
else :
# set file pointer to start of file, will be reset each time between each file operation
toStartOfFile()
STATE.fileData = ""
addColumnsNamestoTSV()
STATE.contentProblem = False
getContent(separator) # type: ignore
separator = getSeparator()
toStartOfFile()
if not STATE.contentProblem:
STATE.content = ""
STATE.content = STATE.file.read().decode('utf-8')
if STATE.content == STATE.fileData :
st.write(STATE.general_text_dict['correct_file'])
STATE.columnMap = {}
if getColumnsNames(separator) : # type: ignore
toStartOfFile()
STATE.fileData = ""
addColumnsNamestoTSV()
STATE.contentProblem = False
getContent(separator) # type: ignore
toStartOfFile()
if not STATE.contentProblem:
STATE.content = ""
STATE.content = STATE.file.read().decode(STATE.encoding)
if STATE.content == STATE.fileData :
st.write(STATE.general_text_dict['correct_file'])
else:
st.write(st.session_state.general_text_dict['new_file'])
show_download_button()
else:
st.write(st.session_state.general_text_dict['new_file'])
show_download_button()
else:
errDisplay()
# STATE.file.close()
errDisplay()
......@@ -7,10 +7,8 @@ import shutil
import zipfile
import tempfile
import os
import codecs
from datetime import date
import re
import chardet
import pandas as pd
import streamlit as st
import lib.tika.tika as tika
......@@ -178,14 +176,14 @@ def getInfo():
def pdfToTSV(fileName, fileAddress, pdfDir):
st.session_state.page = 1
author, title, watermark = getInfo()
tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
tsv = "Authors\tSource\tPublication Year\tpublication Month\tPublication_day\tTitle\tAbstract\n"
with st.spinner(st.session_state.general_text_dict['loading']):
tsv = segmentAbstract(fileName, fileAddress, tsv, author, title,
str(date.today().year), "1", "1", watermark)
if '/' in fileName:
fileName = fileName.split('/')[1]
with open(pdfDir + "/" + fileName.replace(".pdf", "(pdf).tsv"), "w", encoding="utf-8-sig") as file:
with open(pdfDir + "/" + fileName.replace(".pdf", "(pdf).tsv"), "w") as file:
file.write(tsv)
tsv = "\n".join(tsv.split("\n")[1:])
......@@ -271,7 +269,7 @@ if st.session_state.page == 0:
if st.session_state.file != None:
st.session_state.zipDir = tempfile.TemporaryDirectory()
st.session_state.pdfDir = tempfile.TemporaryDirectory()
st.session_state.tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract"
st.session_state.tsv = "Authors\tSource\tPublication Year\tpublication Month\tPublication_day\tTitle\tAbstract\n"
st.session_state.page = 1
extractAllPDF(st.session_state.zipDir.name, st.session_state.file)
st.session_state.len = len(
......@@ -302,7 +300,7 @@ if st.session_state.page == 1:
askPDF(fileName)
if st.session_state.page == 3:
with open(st.session_state.pdfDir.name + "/PDFCompilation.tsv", "w", encoding='utf-8-sig') as file:
with open(st.session_state.pdfDir.name + "/PDFCompilation.tsv", "w") as file:
file.write(st.session_state.tsv)
shutil.make_archive(st.session_state.zipDir.name +
"/PDFCompilation", 'zip', st.session_state.pdfDir.name)
......
import glob
import os
from pathlib import Path
from bs4 import BeautifulSoup
from flask import Flask, request
from grobid_client.grobid_client import GrobidClient
from numpy import number
app = Flask(__name__)
def requestGC() -> None:
......@@ -84,7 +87,10 @@ def getDate(soup : BeautifulSoup) -> str:
date = soup.find('date')
if date is not None:
if date is str:
res = date['when']
try:
res = date['when']
except TypeError:
res = "1\t1\t1"
else:
res = "1\t1\t1"
if "-" in res:
......@@ -92,9 +98,9 @@ def getDate(soup : BeautifulSoup) -> str:
dateTab = res.split("\t")
if len(dateTab) == 2 :
res += "1"
res += "\t1"
elif len(dateTab) == 1 :
res += "1\t1"
res += "\t1\t1"
return res
......@@ -146,6 +152,11 @@ def getAuthors(soup : BeautifulSoup) -> str:
return authors.strip()
def empty_content(pattern: str = "./book/file*") -> None:
    """Delete every file matching *pattern*.

    By default this removes the temporary working files
    (``./book/file*``) produced while converting an uploaded book,
    so the next request starts from a clean state.

    Args:
        pattern: Glob pattern of the files to delete. Defaults to the
            historical hard-coded value, so existing callers are
            unaffected.
    """
    for path in glob.glob(pattern):
        os.remove(path)
def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -> str:
"""
......@@ -181,7 +192,7 @@ def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -
# Loop in paragraphs
for paragraph in paragraphList:
fileData += date + "\t"
fileData += date + "\t" + authors + "\t"
if source is not None and title is not None:
fileData += title + ", " + source + "\t"
elif source is None and title is not None:
......@@ -209,29 +220,67 @@ def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -
fileData += "Title\tAbstract"
fileData += "\n"
fileData = "Publication Year\tPublication Month\tPublication Day\tSource\tTitle\tAbstract\tAuthors\n" + fileData.replace("\n","\t" + authors+"\n")
return fileData
@app.route("/", methods=['POST'])
def getBookTSV() -> str:
if request.method == 'POST':
# check if the post request has the file part
if 'file' not in request.files:
return "no file\n"
def checkXMLFile() -> bool:
    """Report whether the intermediate PDF (./book/file.pdf) is present.

    Returns:
        True when ``./book/file.pdf`` exists; otherwise the leftover
        working files are removed via ``empty_content()`` and False is
        returned.
    """
    pdf_present = os.path.exists("./book/file.pdf")
    if not pdf_present:
        # Conversion failed part-way: drop any partial artefacts.
        empty_content()
    return pdf_present
def checkContent(data: str) -> str:
    """Normalise extracted TSV content.

    Lines whose first character is not a digit are treated as
    continuations and merged into the preceding record; blank lines are
    dropped. Straight double quotes are replaced by typographic quotes
    (”) in the final result.

    Args:
        data: Raw newline-separated TSV content.

    Returns:
        The cleaned content, one record per line, each terminated by a
        newline (empty string when *data* has no non-blank lines).
    """
    records: list[str] = []
    for line in data.split("\n"):
        if not line:
            continue
        if records and not line[0].isnumeric():
            # Continuation: glue onto the previous record.
            records[-1] += line
        else:
            records.append(line)
    cleaned = "\n".join(records) + "\n" if records else ""
    return cleaned.replace('"', "”")
def create_app():
app = Flask(__name__)
@app.route("/getFile", methods=['POST'])
def getBookTSV() -> str:
if request.method == 'POST':
# check if the post request has the file part
if 'dir' not in request.form:
return "no directory path, curl needs to be like -- curl -X POST -F dir='my_directory_path' url --\n"
elif not os.path.exists(request.form["dir"]):
print(request.form["dir"])
return "directory does not exist, directory path should be absolute\n"
else:
dirName = request.form["dir"]
files = Path(dirName).glob('*.pdf')
allData = ""
for file in files:
print(file.name)
with open("book/file.pdf","wb") as f:
f.write(file.read_bytes())
requestGC()
if not checkXMLFile():
return ""
soup = getXMLContent()
title = getBookTitle(soup)
date = getDate(soup)
file = request.files['file']
with open("book/file.pdf","wb") as f:
f.write(file.read())
requestGC()
authors = getAuthors(soup)
soup = getXMLContent()
allData += getData(soup, title, date, authors)
title = getBookTitle(soup)
empty_content()
date = getDate(soup)
return "Publication Year\tPublication Month\tPublication Day\tAuthors\tSource\tTitle\tAbstract\n" + checkContent(allData)
return ""
authors = getAuthors(soup)
return app
return getData(soup, title, date, authors)
return ""
create_app()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment