Commit 401c86e5 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TEXT][Parser] Hal CSV parser.

parent 95cd84b4
...@@ -42,20 +42,20 @@ import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNod ...@@ -42,20 +42,20 @@ import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNod
import Gargantext.Database.Types.Node (HyperdataDocument(..)) import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Database.User (getUser, UserLight(..), Username) import Gargantext.Database.User (getUser, UserLight(..), Username)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS)) import Gargantext.Text.Parsers (parseDocs, FileFormat)
type UserId = Int type UserId = Int
type RootId = Int type RootId = Int
type CorpusId = Int type CorpusId = Int
flowDatabase :: FilePath -> CorpusName -> IO [Int] flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO [Int]
flowDatabase fp cName = do flowDatabase ff fp cName = do
-- Corpus Flow -- Corpus Flow
(masterUserId, _, corpusId) <- subFlow "gargantua" "Big Corpus" (masterUserId, _, corpusId) <- subFlow "gargantua" "Big Corpus"
-- Documents Flow -- Documents Flow
hyperdataDocuments <- map addUniqIds <$> parseDocs WOS fp hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
printDebug "Docs IDs : " (length ids) printDebug "Docs IDs : " (length ids)
idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
......
...@@ -56,6 +56,7 @@ import Gargantext.Prelude ...@@ -56,6 +56,7 @@ import Gargantext.Prelude
import Gargantext.Database.Types.Node (HyperdataDocument(..)) import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Text.Parsers.WOS (wosParser) import Gargantext.Text.Parsers.WOS (wosParser)
import Gargantext.Text.Parsers.Date (parseDate) import Gargantext.Text.Parsers.Date (parseDate)
import Gargantext.Text.Parsers.CSV (parseHal)
import Gargantext.Text.Terms.Stop (detectLang) import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -70,7 +71,7 @@ type ParseError = String ...@@ -70,7 +71,7 @@ type ParseError = String
-- | According to the format of Input file, -- | According to the format of Input file,
-- different parsers are available. -- different parsers are available.
data FileFormat = WOS data FileFormat = WOS | CsvHalFormat -- | CsvGargV3
deriving (Show) deriving (Show)
-- Implemented (ISI Format) -- Implemented (ISI Format)
...@@ -86,11 +87,10 @@ data FileFormat = WOS ...@@ -86,11 +87,10 @@ data FileFormat = WOS
-- | Parse file into documents -- | Parse file into documents
-- TODO manage errors here -- TODO manage errors here
parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument] parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
parseDocs format path = do parseDocs WOS path = join $ mapM (toDoc WOS) <$> snd <$> parse WOS path
docs <- snd <$> parse format path parseDocs CsvHalFormat p = parseHal p
mapM (toDoc format) docs
type Year = Int type Year = Int
type Month = Int type Month = Int
type Day = Int type Day = Int
...@@ -102,11 +102,11 @@ parseDate' l (Just txt) = do ...@@ -102,11 +102,11 @@ parseDate' l (Just txt) = do
utcTime <- parseDate l txt utcTime <- parseDate l txt
let (UTCTime day _) = utcTime let (UTCTime day _) = utcTime
let (y,m,d) = DT.toGregorian day let (y,m,d) = DT.toGregorian day
pure (Just utcTime, (Just (fromIntegral y),Just m,Just d)) pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
toDoc format d = do toDoc WOS d = do
let abstract = lookup "abstract" d let abstract = lookup "abstract" d
let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract)) let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
...@@ -115,7 +115,7 @@ toDoc format d = do ...@@ -115,7 +115,7 @@ toDoc format d = do
(utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
pure $ HyperdataDocument (Just $ DT.pack $ show format) pure $ HyperdataDocument (Just $ DT.pack $ show WOS)
(lookup "doi" d) (lookup "doi" d)
(lookup "URL" d) (lookup "URL" d)
Nothing Nothing
...@@ -134,7 +134,6 @@ toDoc format d = do ...@@ -134,7 +134,6 @@ toDoc format d = do
Nothing Nothing
(Just $ (DT.pack . show) lang) (Just $ (DT.pack . show) lang)
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]]) parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do parse format path = do
files <- case takeExtension path of files <- case takeExtension path of
...@@ -157,7 +156,7 @@ withParser WOS = wosParser ...@@ -157,7 +156,7 @@ withParser WOS = wosParser
--withParser XML = xmlParser --withParser XML = xmlParser
--withParser _ = error "[ERROR] Parser not implemented yet" --withParser _ = error "[ERROR] Parser not implemented yet"
runParser :: FileFormat -> DB.ByteString runParser :: FileFormat -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]]) -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text runParser format text = pure $ parseOnly (withParser format) text
...@@ -173,4 +172,3 @@ clean txt = DT.map clean' txt ...@@ -173,4 +172,3 @@ clean txt = DT.map clean' txt
clean' '’' = '\'' clean' '’' = '\''
clean' c = c clean' c = c
...@@ -25,8 +25,9 @@ import Control.Applicative ...@@ -25,8 +25,9 @@ import Control.Applicative
import Data.Char (ord) import Data.Char (ord)
import Data.Csv import Data.Csv
import Data.Either (Either(Left, Right)) import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate) import Data.Text (Text, pack, length, intercalate, unpack)
import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy as BL
import Data.Time.Segment (jour)
import Data.Vector (Vector) import Data.Vector (Vector)
import qualified Data.Vector as V import qualified Data.Vector as V
...@@ -194,8 +195,6 @@ readHal fp = do ...@@ -194,8 +195,6 @@ readHal fp = do
Left e -> panic (pack e) Left e -> panic (pack e)
Right csvDocs -> pure csvDocs Right csvDocs -> pure csvDocs
------------------------------------------------------------------------ ------------------------------------------------------------------------
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO () writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp $ writeCsv fp (h, vs) = BL.writeFile fp $
encodeByNameWith csvEncodeOptions h (V.toList vs) encodeByNameWith csvEncodeOptions h (V.toList vs)
...@@ -206,7 +205,7 @@ writeCsv fp (h, vs) = BL.writeFile fp $ ...@@ -206,7 +205,7 @@ writeCsv fp (h, vs) = BL.writeFile fp $
data CsvHal = CsvHal data CsvHal = CsvHal
{ csvHal_title :: !Text { csvHal_title :: !Text
, csvHal_source :: !Text , csvHal_source :: !Text
, csvHal_publication_year :: !Int , csvHal_publication_year :: !Integer
, csvHal_publication_month :: !Int , csvHal_publication_month :: !Int
, csvHal_publication_day :: !Int , csvHal_publication_day :: !Int
, csvHal_abstract :: !Text , csvHal_abstract :: !Text
...@@ -257,9 +256,11 @@ instance ToNamedRecord CsvHal where ...@@ -257,9 +256,11 @@ instance ToNamedRecord CsvHal where
toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss jour lang doi auth inst dept lab team doct) = toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss jour lang doi auth inst dept lab team doct) =
namedRecord [ "title" .= t namedRecord [ "title" .= t
, "source" .= s , "source" .= s
, "publication_year" .= py , "publication_year" .= py
, "publication_month" .= pm , "publication_month" .= pm
, "publication_day" .= pd , "publication_day" .= pd
, "abstract" .= abst , "abstract" .= abst
, "authors" .= aut , "authors" .= aut
...@@ -278,3 +279,35 @@ instance ToNamedRecord CsvHal where ...@@ -278,3 +279,35 @@ instance ToNamedRecord CsvHal where
, "rteamStructId_i" .= team , "rteamStructId_i" .= team
, "docType_s" .= doct , "docType_s" .= doct
] ]
-- | Convert one decoded HAL CSV row into a 'HyperdataDocument'.
--
-- The 'CsvHal' constructor is matched positionally. Only the title, source,
-- publication date parts, abstract, authors, URL and DOI columns are used;
-- the remaining columns (ISBN, issue, journal, language, and the
-- author/institution/department/lab/team/doc-type identifiers) are ignored.
--
-- NOTE(review): 'HyperdataDocument' is also filled positionally and its
-- field names are not visible from this module -- confirm the argument
-- order against its definition before reordering anything here.
csvHal2doc :: CsvHal -> HyperdataDocument
csvHal2doc (CsvHal ttl src yr mth dy abstr auths link _ _ _ _ doiTxt _ _ _ _ _ _) =
  HyperdataDocument
    (Just "CsvHal")          -- constant tag identifying the origin format
    (Just doiTxt)
    (Just link)
    Nothing
    Nothing
    Nothing
    (Just ttl)
    (Just auths)
    (Just src)
    (Just abstr)
    (Just pubDate)
    (Just (fromIntegral yr))
    (Just mth)
    (Just dy)
    Nothing
    Nothing
    Nothing
    Nothing
  where
    -- Textual rendering (via 'show') of the date built from year/month/day.
    pubDate = pack (show (jour yr mth dy))
------------------------------------------------------------------------
-- | Read a HAL CSV export from the given path and convert every row into a
-- 'HyperdataDocument' with 'csvHal2doc'.
--
-- The first component of the 'readHal' result is discarded; only the
-- vector of decoded rows is kept and turned into a list.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = do
  halCsv <- readHal fp
  pure $ map csvHal2doc (V.toList (snd halCsv))
------------------------------------------------------------------------
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment