Commit 401c86e5 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TEXT][Parser] Hal CSV parser.

parent 95cd84b4
......@@ -42,20 +42,20 @@ import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNod
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Database.User (getUser, UserLight(..), Username)
import Gargantext.Prelude
import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
import Gargantext.Text.Parsers (parseDocs, FileFormat)
type UserId = Int
type RootId = Int
type CorpusId = Int
flowDatabase :: FilePath -> CorpusName -> IO [Int]
flowDatabase fp cName = do
flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO [Int]
flowDatabase ff fp cName = do
-- Corpus Flow
(masterUserId, _, corpusId) <- subFlow "gargantua" "Big Corpus"
-- Documents Flow
hyperdataDocuments <- map addUniqIds <$> parseDocs WOS fp
hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
printDebug "Docs IDs : " (length ids)
idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
......
......@@ -56,6 +56,7 @@ import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Parsers.WOS (wosParser)
import Gargantext.Text.Parsers.Date (parseDate)
import Gargantext.Text.Parsers.CSV (parseHal)
import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------
......@@ -70,7 +71,7 @@ type ParseError = String
-- | According to the format of Input file,
-- different parsers are available.
data FileFormat = WOS
data FileFormat = WOS | CsvHalFormat -- | CsvGargV3
deriving (Show)
-- Implemented (ISI Format)
......@@ -86,11 +87,10 @@ data FileFormat = WOS
-- | Parse file into documents
-- TODO manage errors here
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs format path = do
docs <- snd <$> parse format path
mapM (toDoc format) docs
parseDocs WOS path = join $ mapM (toDoc WOS) <$> snd <$> parse WOS path
parseDocs CsvHalFormat p = parseHal p
type Year = Int
type Year = Int
type Month = Int
type Day = Int
......@@ -102,11 +102,11 @@ parseDate' l (Just txt) = do
utcTime <- parseDate l txt
let (UTCTime day _) = utcTime
let (y,m,d) = DT.toGregorian day
pure (Just utcTime, (Just (fromIntegral y),Just m,Just d))
pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc format d = do
toDoc WOS d = do
let abstract = lookup "abstract" d
let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
......@@ -115,7 +115,7 @@ toDoc format d = do
(utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
pure $ HyperdataDocument (Just $ DT.pack $ show format)
pure $ HyperdataDocument (Just $ DT.pack $ show WOS)
(lookup "doi" d)
(lookup "URL" d)
Nothing
......@@ -134,7 +134,6 @@ toDoc format d = do
Nothing
(Just $ (DT.pack . show) lang)
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
files <- case takeExtension path of
......@@ -157,7 +156,7 @@ withParser WOS = wosParser
--withParser XML = xmlParser
--withParser _ = error "[ERROR] Parser not implemented yet"
runParser :: FileFormat -> DB.ByteString
runParser :: FileFormat -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text
......@@ -173,4 +172,3 @@ clean txt = DT.map clean' txt
clean' '’' = '\''
clean' c = c
......@@ -25,8 +25,9 @@ import Control.Applicative
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate)
import Data.Text (Text, pack, length, intercalate, unpack)
import qualified Data.ByteString.Lazy as BL
import Data.Time.Segment (jour)
import Data.Vector (Vector)
import qualified Data.Vector as V
......@@ -194,8 +195,6 @@ readHal fp = do
Left e -> panic (pack e)
Right csvDocs -> pure csvDocs
------------------------------------------------------------------------
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp $
encodeByNameWith csvEncodeOptions h (V.toList vs)
......@@ -206,7 +205,7 @@ writeCsv fp (h, vs) = BL.writeFile fp $
data CsvHal = CsvHal
{ csvHal_title :: !Text
, csvHal_source :: !Text
, csvHal_publication_year :: !Int
, csvHal_publication_year :: !Integer
, csvHal_publication_month :: !Int
, csvHal_publication_day :: !Int
, csvHal_abstract :: !Text
......@@ -257,9 +256,11 @@ instance ToNamedRecord CsvHal where
toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss jour lang doi auth inst dept lab team doct) =
namedRecord [ "title" .= t
, "source" .= s
, "publication_year" .= py
, "publication_month" .= pm
, "publication_day" .= pd
, "abstract" .= abst
, "authors" .= aut
......@@ -278,3 +279,35 @@ instance ToNamedRecord CsvHal where
, "rteamStructId_i" .= team
, "docType_s" .= doct
]
-- | Build a 'HyperdataDocument' from a single 'CsvHal' record.
--
-- Only the title, source, publication year/month/day, abstract, authors,
-- URL and DOI of the record are used; every other 'CsvHal' field is
-- ignored. The first constructor argument is the literal @"CsvHal"@ and
-- the slots that receive no value from the record are set to 'Nothing'.
--
-- NOTE(review): 'HyperdataDocument' is applied positionally here, so the
-- meaning of each slot depends on its declaration order — confirm against
-- the type definition when touching this function.
csvHal2doc :: CsvHal -> HyperdataDocument
csvHal2doc (CsvHal t src y m d abst auts link _ _ _ _ doiRef _ _ _ _ _ _) =
  HyperdataDocument
    (Just "CsvHal")
    (Just doiRef)
    (Just link)
    Nothing
    Nothing
    Nothing
    (Just t)
    (Just auts)
    (Just src)
    (Just abst)
    (Just dateTxt)
    (Just (fromIntegral y))  -- the CSV year is an 'Integer'; convert for the document
    (Just m)
    (Just d)
    Nothing
    Nothing
    Nothing
    Nothing
  where
    -- Textual date: the 'show' rendering of what 'jour' builds from
    -- the year, month and day of the record.
    dateTxt = pack (show (jour y m d))
------------------------------------------------------------------------
-- | Read the HAL CSV file at the given path with 'readHal' and convert
-- every decoded row into a 'HyperdataDocument' via 'csvHal2doc'.
--
-- The first component of the 'readHal' result is dropped; only the vector
-- of rows is used, turned into a list in the same order.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = rowsToDocs <$> readHal fp
  where
    -- Keep the rows (second component), list them, convert each one.
    rowsToDocs = map csvHal2doc . V.toList . snd
------------------------------------------------------------------------
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment