Commit b5fe01d8 authored by Marie FU's avatar Marie FU

Enable the API to convert all files from a directory, and apply minor corrections to the Streamlit pages

parent 369bdd93
...@@ -49,3 +49,6 @@ en,correct_file,"The given file is correct." ...@@ -49,3 +49,6 @@ en,correct_file,"The given file is correct."
fr,new_file,"Télécharger le fichier TSV :" fr,new_file,"Télécharger le fichier TSV :"
en,new_file,"Download the TSV file : " en,new_file,"Download the TSV file : "
fr,err_detect_encoding,"L'encodage n'a pas pu être détecté"
en,err_detect_encoding,"Encoding could not be detected"
...@@ -6,14 +6,16 @@ Marie FU ...@@ -6,14 +6,16 @@ Marie FU
from io import StringIO from io import StringIO
import typing import typing
import chardet
import streamlit as st import streamlit as st
import csv import csv
import re import re
import codecs import codecs
import src.basic as tmp import src.basic as tmp
from streamlit.errors import StreamlitAPIException
# Define constant for file encoding supported (name in lower case) # Define constant for file encoding supported (name in lower case)
FILE_ENCODING = "utf-8" FILE_ENCODING = ["utf-8", "utf-8-sig", "ascii"]
# Define constant for GarganText TSV columns # Define constant for GarganText TSV columns
TSV_COLUMNS = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"] TSV_COLUMNS = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]
...@@ -35,15 +37,19 @@ def checkEncoding() -> bool: ...@@ -35,15 +37,19 @@ def checkEncoding() -> bool:
(Boolean) : True if encoded correctly, False otherwise (Boolean) : True if encoded correctly, False otherwise
Authors: Authors:
Nicolas Atrax
Marie FU Marie FU
""" """
content = STATE.file.read() content = STATE.file.read()
try: STATE.encoding = chardet.detect(content)["encoding"]
content.decode(FILE_ENCODING) if STATE.encoding is not None:
if STATE.encoding.lower() not in FILE_ENCODING:
STATE.errMessageLog += st.session_state.general_text_dict['err_file_encoding']
return False
return True return True
except UnicodeDecodeError: else:
STATE.errMessageLog += st.session_state.general_text_dict['err_file_encoding'] STATE.errMessageLog += st.session_state.general_text_dict['err_detect_encoding']
return False return False
...@@ -64,7 +70,7 @@ def getSeparator() -> typing.Union[str,None]: ...@@ -64,7 +70,7 @@ def getSeparator() -> typing.Union[str,None]:
""" """
toStartOfFile() toStartOfFile()
line = STATE.file.readline().decode('utf-8') line = STATE.file.readline().decode(STATE.encoding)
if ',' in line: if ',' in line:
if '\t' in line or ';' in line: if '\t' in line or ';' in line:
...@@ -246,7 +252,7 @@ def getColumnsNames(separator : str) -> typing.Union[bool,None]: ...@@ -246,7 +252,7 @@ def getColumnsNames(separator : str) -> typing.Union[bool,None]:
""" """
registeredNames = [] registeredNames = []
line = StringIO(STATE.file.getvalue().decode("utf-8")).read().split("\n")[0].split(separator) line = StringIO(STATE.file.getvalue().decode(STATE.encoding)).read().split("\n")[0].split(separator)
othersColumns = [] othersColumns = []
for name in line: for name in line:
registeredNames, otherColumns = checkColumnNames(name,registeredNames, othersColumns) registeredNames, otherColumns = checkColumnNames(name,registeredNames, othersColumns)
...@@ -347,11 +353,11 @@ def getContent(separator : str) -> None: ...@@ -347,11 +353,11 @@ def getContent(separator : str) -> None:
csvLine = 2 csvLine = 2
reader = csv.DictReader(codecs.iterdecode(STATE.file, 'utf-8'), delimiter=separator) reader = csv.DictReader(codecs.iterdecode(STATE.file, STATE.encoding), delimiter=separator)
for row in reader: for row in reader:
for name, value in row.items(): for name, value in row.items():
if name in STATE.columnMap.keys() and value is not None: if name in STATE.columnMap.keys() and value is not None:
value = value.replace("\"","”") value = value.replace("\"","”").replace("\n"," ")
checkMissing(lowerName(name), value, csvLine) checkMissing(lowerName(name), value, csvLine)
csvLine += 1 csvLine += 1
STATE.fileData = STATE.fileData[:-1] + "\n" STATE.fileData = STATE.fileData[:-1] + "\n"
...@@ -376,7 +382,11 @@ def show_download_button() -> None: ...@@ -376,7 +382,11 @@ def show_download_button() -> None:
Marie FU Marie FU
""" """
st.download_button(STATE.newFileName, STATE.fileData, STATE.newFileName) try:
st.download_button(STATE.newFileName, STATE.fileData, STATE.newFileName)
except StreamlitAPIException:
st.write("duplicate file")
print("here")
return return
...@@ -411,7 +421,7 @@ def checkNewFileName() -> bool : ...@@ -411,7 +421,7 @@ def checkNewFileName() -> bool :
STATE.errMessageLog += st.session_state.general_text_dict['err_file_name'] STATE.errMessageLog += st.session_state.general_text_dict['err_file_name']
return False return False
else : else :
STATE.newFileName = "File.tsv" STATE.newFileName = STATE.file.name.split(".")[0] + ".tsv"
return True return True
@st.experimental_fragment @st.experimental_fragment
...@@ -430,43 +440,44 @@ st.write(st.session_state.general_text_dict['description']) ...@@ -430,43 +440,44 @@ st.write(st.session_state.general_text_dict['description'])
STATE.newFileName = st.text_input(label=st.session_state.general_text_dict['file_name_input'],placeholder="File.tsv") STATE.newFileName = st.text_input(label=st.session_state.general_text_dict['file_name_input'],placeholder="File.tsv")
st.file_uploader(st.session_state.general_text_dict['file'], type=["tsv", "csv"], key="file") st.file_uploader(st.session_state.general_text_dict['file'], type=["tsv", "csv"], key="file_",accept_multiple_files=True)
# checking if a file is uploaded # checking if a file is uploaded
if STATE.file is not None: if STATE.file_ is not None:
for f in STATE.file_:
# first utilisation of errMessageLog, contain the log of every error message encountered STATE.file = f
STATE.errMessageLog = "" STATE.newFileName = None
if not checkNewFileName() or not checkEncoding() or getSeparator() is None:
errDisplay() # first utilisation of errMessageLog, contain the log of every error message encountered
else : STATE.errMessageLog = ""
# set file pointer to start of file, will be reset each time between each file operation if not checkNewFileName() or not checkEncoding() or getSeparator() is None:
toStartOfFile() errDisplay()
else :
separator = getSeparator() # set file pointer to start of file, will be reset each time between each file operation
toStartOfFile()
STATE.columnMap = {}
if getColumnsNames(separator) : # type: ignore
toStartOfFile() toStartOfFile()
STATE.fileData = "" separator = getSeparator()
addColumnsNamestoTSV()
STATE.contentProblem = False
getContent(separator) # type: ignore
toStartOfFile() toStartOfFile()
if not STATE.contentProblem:
STATE.content = "" STATE.columnMap = {}
STATE.content = STATE.file.read().decode('utf-8') if getColumnsNames(separator) : # type: ignore
if STATE.content == STATE.fileData : toStartOfFile()
st.write(STATE.general_text_dict['correct_file'])
STATE.fileData = ""
addColumnsNamestoTSV()
STATE.contentProblem = False
getContent(separator) # type: ignore
toStartOfFile()
if not STATE.contentProblem:
STATE.content = ""
STATE.content = STATE.file.read().decode(STATE.encoding)
if STATE.content == STATE.fileData :
st.write(STATE.general_text_dict['correct_file'])
else:
st.write(st.session_state.general_text_dict['new_file'])
show_download_button()
else: else:
st.write(st.session_state.general_text_dict['new_file']) errDisplay()
show_download_button()
else:
errDisplay()
# STATE.file.close()
...@@ -7,10 +7,8 @@ import shutil ...@@ -7,10 +7,8 @@ import shutil
import zipfile import zipfile
import tempfile import tempfile
import os import os
import codecs
from datetime import date from datetime import date
import re import re
import chardet
import pandas as pd import pandas as pd
import streamlit as st import streamlit as st
import lib.tika.tika as tika import lib.tika.tika as tika
...@@ -178,14 +176,14 @@ def getInfo(): ...@@ -178,14 +176,14 @@ def getInfo():
def pdfToTSV(fileName, fileAddress, pdfDir): def pdfToTSV(fileName, fileAddress, pdfDir):
st.session_state.page = 1 st.session_state.page = 1
author, title, watermark = getInfo() author, title, watermark = getInfo()
tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n" tsv = "Authors\tSource\tPublication Year\tpublication Month\tPublication_day\tTitle\tAbstract\n"
with st.spinner(st.session_state.general_text_dict['loading']): with st.spinner(st.session_state.general_text_dict['loading']):
tsv = segmentAbstract(fileName, fileAddress, tsv, author, title, tsv = segmentAbstract(fileName, fileAddress, tsv, author, title,
str(date.today().year), "1", "1", watermark) str(date.today().year), "1", "1", watermark)
if '/' in fileName: if '/' in fileName:
fileName = fileName.split('/')[1] fileName = fileName.split('/')[1]
with open(pdfDir + "/" + fileName.replace(".pdf", "(pdf).tsv"), "w", encoding="utf-8-sig") as file: with open(pdfDir + "/" + fileName.replace(".pdf", "(pdf).tsv"), "w") as file:
file.write(tsv) file.write(tsv)
tsv = "\n".join(tsv.split("\n")[1:]) tsv = "\n".join(tsv.split("\n")[1:])
...@@ -271,7 +269,7 @@ if st.session_state.page == 0: ...@@ -271,7 +269,7 @@ if st.session_state.page == 0:
if st.session_state.file != None: if st.session_state.file != None:
st.session_state.zipDir = tempfile.TemporaryDirectory() st.session_state.zipDir = tempfile.TemporaryDirectory()
st.session_state.pdfDir = tempfile.TemporaryDirectory() st.session_state.pdfDir = tempfile.TemporaryDirectory()
st.session_state.tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract" st.session_state.tsv = "Authors\tSource\tPublication Year\tpublication Month\tPublication_day\tTitle\tAbstract\n"
st.session_state.page = 1 st.session_state.page = 1
extractAllPDF(st.session_state.zipDir.name, st.session_state.file) extractAllPDF(st.session_state.zipDir.name, st.session_state.file)
st.session_state.len = len( st.session_state.len = len(
...@@ -302,7 +300,7 @@ if st.session_state.page == 1: ...@@ -302,7 +300,7 @@ if st.session_state.page == 1:
askPDF(fileName) askPDF(fileName)
if st.session_state.page == 3: if st.session_state.page == 3:
with open(st.session_state.pdfDir.name + "/PDFCompilation.tsv", "w", encoding='utf-8-sig') as file: with open(st.session_state.pdfDir.name + "/PDFCompilation.tsv", "w") as file:
file.write(st.session_state.tsv) file.write(st.session_state.tsv)
shutil.make_archive(st.session_state.zipDir.name + shutil.make_archive(st.session_state.zipDir.name +
"/PDFCompilation", 'zip', st.session_state.pdfDir.name) "/PDFCompilation", 'zip', st.session_state.pdfDir.name)
......
import glob
import os
from pathlib import Path
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from flask import Flask, request from flask import Flask, request
from grobid_client.grobid_client import GrobidClient from grobid_client.grobid_client import GrobidClient
from numpy import number
app = Flask(__name__)
def requestGC() -> None: def requestGC() -> None:
...@@ -84,7 +87,10 @@ def getDate(soup : BeautifulSoup) -> str: ...@@ -84,7 +87,10 @@ def getDate(soup : BeautifulSoup) -> str:
date = soup.find('date') date = soup.find('date')
if date is not None: if date is not None:
if date is str: if date is str:
res = date['when'] try:
res = date['when']
except TypeError:
res = "1\t1\t1"
else: else:
res = "1\t1\t1" res = "1\t1\t1"
if "-" in res: if "-" in res:
...@@ -92,9 +98,9 @@ def getDate(soup : BeautifulSoup) -> str: ...@@ -92,9 +98,9 @@ def getDate(soup : BeautifulSoup) -> str:
dateTab = res.split("\t") dateTab = res.split("\t")
if len(dateTab) == 2 : if len(dateTab) == 2 :
res += "1" res += "\t1"
elif len(dateTab) == 1 : elif len(dateTab) == 1 :
res += "1\t1" res += "\t1\t1"
return res return res
...@@ -146,6 +152,11 @@ def getAuthors(soup : BeautifulSoup) -> str: ...@@ -146,6 +152,11 @@ def getAuthors(soup : BeautifulSoup) -> str:
return authors.strip() return authors.strip()
def empty_content() -> None:
    """
    Remove the working files left in the ``book`` directory.

    Every path matching ``./book/file*`` (relative to the current working
    directory) is deleted with ``os.remove``.
    """
    stale_paths = glob.glob("./book/file*")
    for stale_path in stale_paths:
        os.remove(stale_path)
def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -> str: def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -> str:
""" """
...@@ -181,7 +192,7 @@ def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) - ...@@ -181,7 +192,7 @@ def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -
# Loop in paragraphs # Loop in paragraphs
for paragraph in paragraphList: for paragraph in paragraphList:
fileData += date + "\t" fileData += date + "\t" + authors + "\t"
if source is not None and title is not None: if source is not None and title is not None:
fileData += title + ", " + source + "\t" fileData += title + ", " + source + "\t"
elif source is None and title is not None: elif source is None and title is not None:
...@@ -209,29 +220,67 @@ def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) - ...@@ -209,29 +220,67 @@ def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -
fileData += "Title\tAbstract" fileData += "Title\tAbstract"
fileData += "\n" fileData += "\n"
fileData = "Publication Year\tPublication Month\tPublication Day\tSource\tTitle\tAbstract\tAuthors\n" + fileData.replace("\n","\t" + authors+"\n")
return fileData return fileData
@app.route("/", methods=['POST']) def checkXMLFile() -> bool:
def getBookTSV() -> str: if os.path.exists("./book/file.pdf"):
if request.method == 'POST': return True
# check if the post request has the file part empty_content()
if 'file' not in request.files: return False
return "no file\n"
def checkContent(data:str) -> str:
    """
    Rebuild TSV rows that were split across several lines.

    Empty lines are dropped. A line whose first character is numeric starts
    a new row; any other line is appended directly (without separator) to
    the previous row. Every row is terminated by a newline and every
    straight double quote is replaced by a typographic closing quote.

    Args:
        data (str) : raw TSV content, rows separated by "\\n"

    Returns:
        (str) : the cleaned content, one row per line
    """
    rows = []
    for chunk in data.split("\n"):
        if chunk == "":
            continue
        # A non-numeric first line has no previous row to extend, so it
        # opens a row of its own, exactly like a numeric one.
        if chunk[0].isnumeric() or not rows:
            rows.append(chunk)
        else:
            rows[-1] = rows[-1] + chunk
    merged = "".join(row + "\n" for row in rows)
    return merged.replace("\"","”")
def create_app():
app = Flask(__name__)
@app.route("/getFile", methods=['POST'])
def getBookTSV() -> str:
if request.method == 'POST':
# check if the post request has the file part
if 'dir' not in request.form:
return "no directory path, curl needs to be like -- curl -X POST -F dir='my_directory_path' url --\n"
elif not os.path.exists(request.form["dir"]):
print(request.form["dir"])
return "directory does not exist, directory path should be absolute\n"
else:
dirName = request.form["dir"]
files = Path(dirName).glob('*.pdf')
allData = ""
for file in files:
print(file.name)
with open("book/file.pdf","wb") as f:
f.write(file.read_bytes())
requestGC()
if not checkXMLFile():
return ""
soup = getXMLContent()
title = getBookTitle(soup)
date = getDate(soup)
file = request.files['file'] authors = getAuthors(soup)
with open("book/file.pdf","wb") as f:
f.write(file.read())
requestGC()
soup = getXMLContent() allData += getData(soup, title, date, authors)
title = getBookTitle(soup) empty_content()
date = getDate(soup) return "Publication Year\tPublication Month\tPublication Day\tAuthors\tSource\tTitle\tAbstract\n" + checkContent(allData)
return ""
authors = getAuthors(soup) return app
return getData(soup, title, date, authors) create_app()
return ""
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment