Commit 0f423675 authored by Loïc Chapron's avatar Loïc Chapron

JsonCorpusToTSV

parent 9aecec67
#######
# jsonCorpusToCsv.py
# description : change a json GarganText corpus into a csv legacy corpus
# licence : AGPL + CECILL v3
# author : quentin lobbé - qlobbe@iscpif.fr
#######
# python3 jsonCorpusToCsv.py corpus.json
import csv
import json
import os
import sys
from zipfile import ZipFile
# The corpus path is the first (and only) command-line argument.
try:
    pathCorpus = sys.argv[1]
except IndexError:
    # Only a missing argument is expected here; anything else should surface.
    # Exit with a non-zero status so shells and calling scripts see the failure.
    print("! args error\n")
    sys.exit(1)
def readZipFile(path):
    """Load the JSON document stored in the first file of a zip archive.

    Directory entries (names ending with '/') are skipped, so an archive
    whose first entry is a folder is still read correctly.

    path: location of the zip archive.
    Returns the decoded JSON value.
    Raises IndexError when the archive contains no file entry.
    """
    with ZipFile(path, 'r') as archive:
        members = [name for name in archive.namelist() if not name.endswith('/')]
        if not members:
            raise IndexError("no file found in zip archive: " + str(path))
        # Close the member handle explicitly instead of relying on the archive.
        with archive.open(members[0]) as member:
            return json.load(member)
def readJson(path):
    """Load and return the JSON document stored in the file at `path`.

    The file is decoded as UTF-8 rather than with the platform default, and
    the handle is closed as soon as parsing is done.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)
# Choose the loader from the real file suffix. Splitting on the first dot
# misreads paths such as "./corpus.zip" or "my.corpus.zip" and fails on
# paths without any dot.
if pathCorpus.lower().endswith('.zip'):
    corpusJson = readZipFile(pathCorpus)
else:
    corpusJson = readJson(pathCorpus)
def _cleanField(value):
    """Return `value` without the characters that would break an output row.

    Double quotes and tabs are removed; line breaks are replaced by spaces
    because each record must stay on a single tab-separated line.
    """
    for char in ('"', '\t'):
        value = value.replace(char, '')
    return value.replace('\r', ' ').replace('\n', ' ')


# Write next to the input file, swapping only the final extension for ".csv"
# (so "./corpus.json" gives "./corpus.csv", not ".csv").
outputPath = os.path.splitext(pathCorpus)[0] + ".csv"
header = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"

with open(outputPath, "w", encoding="utf-8") as output:
    output.write(header)
    for row in corpusJson['corpus']:
        doc = row['document']['hyperdata']
        fields = [
            # Missing text fields fall back to the literal "empty".
            _cleanField(doc.get('title', 'empty')),
            _cleanField(doc.get('source', 'empty')),
            # The three date parts are read directly: a document without
            # them raises KeyError.
            str(doc['publication_year']),
            str(doc['publication_month']),
            str(doc['publication_day']),
            _cleanField(doc.get('abstract', 'empty')),
            _cleanField(doc.get('authors', 'empty')),
            # Every document is written with a weight of 1.
            str(1),
        ]
        output.write("\t".join(fields) + "\n")
\ No newline at end of file
# JsonCorpusToTSV
## About the project
JsonCorpusToTSV transforms a JSON corpus exported from GarganText into a TSV corpus.
## Usage
```shell
python3 JsonCorpusToTSV.py corpus.json
```
corpus.json -> GarganText corpus in JSON format
Outputs a legacy TSV corpus: corpus.csv
You can also pass a zip file that contains a JSON corpus:
```shell
python3 JsonCorpusToTSV.py corpus.zip
```
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment