Commit 6cb3efe5 authored by Przemyslaw Kaminski

[CsvDoc] implement Maybe for year/month/date

parent 5bee1178
Pipeline #1739 passed with stage
in 36 minutes and 21 seconds
...@@ -21,6 +21,7 @@ import Crypto.Hash.SHA256 (hash) ...@@ -21,6 +21,7 @@ import Crypto.Hash.SHA256 (hash)
import Data.Aeson import Data.Aeson
import Data.Either (Either(..)) import Data.Either (Either(..))
import Data.List (concat, nub, isSuffixOf) import Data.List (concat, nub, isSuffixOf)
import Data.Maybe (fromMaybe)
import Data.String (String) import Data.String (String)
import GHC.IO (FilePath) import GHC.IO (FilePath)
import qualified Prelude as Prelude import qualified Prelude as Prelude
...@@ -152,8 +153,13 @@ csvToDocs parser patterns time path = ...@@ -152,8 +153,13 @@ csvToDocs parser patterns time path =
Right r -> Right r ->
pure $ Vector.toList pure $ Vector.toList
$ Vector.take limit $ Vector.take limit
$ Vector.map (\row -> Document (toPhyloDate (Csv.unIntOrDec $ csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time) $ Vector.map (\row -> Document (toPhyloDate (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
(toPhyloDate' (Csv.unIntOrDec $ csv_publication_year row) (csv_publication_month row) (csv_publication_day row)) (fromMaybe Csv.defaultMonth $ csv_publication_month row)
(fromMaybe Csv.defaultDay $ csv_publication_day row)
time)
(toPhyloDate' (Csv.fromMIntOrDec Csv.defaultYear $ csv_publication_year row)
(fromMaybe Csv.defaultMonth $ csv_publication_month row)
(fromMaybe Csv.defaultDay $ csv_publication_day row))
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row)) (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing Nothing
[] []
......
...@@ -42,7 +42,7 @@ import Gargantext.Core.Types ...@@ -42,7 +42,7 @@ import Gargantext.Core.Types
import Gargantext.Core.Text.Terms import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Context import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.WithList import Gargantext.Core.Text.Terms.WithList
import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec) import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec, fromMIntOrDec, defaultYear)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList) import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms (terms) import Gargantext.Core.Text.Terms (terms)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs) import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
...@@ -91,7 +91,7 @@ main = do ...@@ -91,7 +91,7 @@ main = do
Right cf -> do Right cf -> do
let corpus = DM.fromListWith (<>) let corpus = DM.fromListWith (<>)
. DV.toList . DV.toList
. DV.map (\n -> (unIntOrDec $ csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)])) . DV.map (\n -> (fromMIntOrDec defaultYear $ csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. snd $ cf . snd $ cf
-- termListMap :: [Text] -- termListMap :: [Text]
......
...@@ -283,7 +283,7 @@ csvPost l m = do ...@@ -283,7 +283,7 @@ csvPost l m = do
let lst = readCsvText m let lst = readCsvText m
let p = parseCsvData lst let p = parseCsvData lst
--printDebug "[csvPost] lst" lst --printDebug "[csvPost] lst" lst
--printDebug "[csvPost] p" p printDebug "[csvPost] p" p
_ <- setListNgrams l NgramsTerms p _ <- setListNgrams l NgramsTerms p
pure True pure True
------------------------------------------------------------------------ ------------------------------------------------------------------------
......
...@@ -20,6 +20,7 @@ import qualified Data.ByteString.Lazy as BL ...@@ -20,6 +20,7 @@ import qualified Data.ByteString.Lazy as BL
import Data.Char (ord) import Data.Char (ord)
import Data.Csv import Data.Csv
import Data.Either (Either(..)) import Data.Either (Either(..))
import Data.Maybe (fromMaybe)
import Data.Text (Text, pack, length, intercalate) import Data.Text (Text, pack, length, intercalate)
import Data.Time.Segment (jour) import Data.Time.Segment (jour)
import qualified Data.Vector as V import qualified Data.Vector as V
...@@ -85,8 +86,10 @@ toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) = ...@@ -85,8 +86,10 @@ toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
-- | Types Conversions -- | Types Conversions
toDocs :: Vector CsvDoc -> [CsvGargV3] toDocs :: Vector CsvDoc -> [CsvGargV3]
toDocs v = V.toList toDocs v = V.toList
$ V.zipWith (\nId (CsvDoc t s (IntOrDec py) pm pd abst auth) $ V.zipWith (\nId (CsvDoc t s mPy pm pd abst auth)
-> CsvGargV3 nId t s py pm pd abst auth ) -> CsvGargV3 nId t s
(fromMIntOrDec defaultYear mPy) (fromMaybe defaultMonth pm) (fromMaybe defaultDay pd)
abst auth )
(V.enumFromN 1 (V.length v'')) v'' (V.enumFromN 1 (V.length v'')) v''
where where
v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
...@@ -96,7 +99,7 @@ toDocs v = V.toList ...@@ -96,7 +99,7 @@ toDocs v = V.toList
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs fromDocs docs = V.map fromDocs' docs
where where
fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s (IntOrDec py) pm pd abst auth) fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s (Just $ IntOrDec py) (Just pm) (Just pd) abst auth)
--------------------------------------------------------------- ---------------------------------------------------------------
-- | Split a document in its context -- | Split a document in its context
...@@ -150,12 +153,21 @@ instance FromField IntOrDec where ...@@ -150,12 +153,21 @@ instance FromField IntOrDec where
instance ToField IntOrDec where instance ToField IntOrDec where
toField (IntOrDec i) = toField i toField (IntOrDec i) = toField i
-- | Collapse an optional 'IntOrDec' into a plain 'Int'.
--
-- Returns the wrapped integer when a value is present, and the supplied
-- fallback when the value is 'Nothing'.
fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
fromMIntOrDec fallback mVal =
  case mVal of
    Nothing -> fallback
    Just v  -> unIntOrDec v
-- | Fallback publication year substituted when a 'CsvDoc' carries no year.
-- NOTE(review): the reason for choosing 1973 is not visible here — confirm
-- with the author before relying on it as a sentinel.
defaultYear :: Int
defaultYear = 1973
-- | Fallback publication month substituted when a 'CsvDoc' carries no month.
defaultMonth :: Int
defaultMonth = 1
-- | Fallback publication day substituted when a 'CsvDoc' carries no day.
defaultDay :: Int
defaultDay = 1
data CsvDoc = CsvDoc data CsvDoc = CsvDoc
{ csv_title :: !Text { csv_title :: !Text
, csv_source :: !Text , csv_source :: !Text
, csv_publication_year :: !IntOrDec , csv_publication_year :: !(Maybe IntOrDec)
, csv_publication_month :: !Int , csv_publication_month :: !(Maybe Int)
, csv_publication_day :: !Int , csv_publication_day :: !(Maybe Int)
, csv_abstract :: !Text , csv_abstract :: !Text
, csv_authors :: !Text , csv_authors :: !Text
} }
...@@ -172,21 +184,21 @@ instance FromNamedRecord CsvDoc where ...@@ -172,21 +184,21 @@ instance FromNamedRecord CsvDoc where
instance ToNamedRecord CsvDoc where instance ToNamedRecord CsvDoc where
toNamedRecord (CsvDoc t s py pm pd abst aut) = toNamedRecord (CsvDoc t s py pm pd abst aut) =
namedRecord [ "title" .= t namedRecord [ "title" .= t
, "source" .= s , "source" .= s
, "publication_year" .= py , "publication_year" .= py
, "publication_month" .= pm , "publication_month" .= pm
, "publication_day" .= pd , "publication_day" .= pd
, "abstract" .= abst , "abstract" .= abst
, "authors" .= aut , "authors" .= aut
] ]
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h) hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
(m $ _hd_source h) (m $ _hd_source h)
(IntOrDec $ mI $ _hd_publication_year h) (Just $ IntOrDec $ mI $ _hd_publication_year h)
(mI $ _hd_publication_month h) (Just $ mI $ _hd_publication_month h)
(mI $ _hd_publication_day h) (Just $ mI $ _hd_publication_day h)
(m $ _hd_abstract h) (m $ _hd_abstract h)
(m $ _hd_authors h) (m $ _hd_authors h)
...@@ -368,7 +380,7 @@ csvHal2doc (CsvHal title source ...@@ -368,7 +380,7 @@ csvHal2doc (CsvHal title source
csv2doc :: CsvDoc -> HyperdataDocument csv2doc :: CsvDoc -> HyperdataDocument
csv2doc (CsvDoc title source csv2doc (CsvDoc title source
(IntOrDec pub_year) pub_month pub_day mPubYear mPubMonth mPubDay
abstract authors ) = HyperdataDocument (Just "CsvHal") abstract authors ) = HyperdataDocument (Just "CsvHal")
Nothing Nothing
Nothing Nothing
...@@ -380,14 +392,18 @@ csv2doc (CsvDoc title source ...@@ -380,14 +392,18 @@ csv2doc (CsvDoc title source
Nothing Nothing
(Just source) (Just source)
(Just abstract) (Just abstract)
(Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day) (Just $ pack . show $ jour (fromIntegral pubYear) pubMonth pubDay)
(Just $ fromIntegral pub_year) (Just pubYear)
(Just pub_month) (Just pubMonth)
(Just pub_day) (Just pubDay)
Nothing Nothing
Nothing Nothing
Nothing Nothing
Nothing Nothing
where
pubYear = fromMIntOrDec defaultYear mPubYear
pubMonth = fromMaybe defaultMonth mPubMonth
pubDay = fromMaybe defaultDay mPubDay
------------------------------------------------------------------------ ------------------------------------------------------------------------
parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument]) parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
......
...@@ -28,7 +28,7 @@ import System.IO (FilePath) ...@@ -28,7 +28,7 @@ import System.IO (FilePath)
import Gargantext.Core.Text.Corpus.Parsers.CSV (CsvDoc(..), writeFile, headerCsvGargV3) import Gargantext.Core.Text.Corpus.Parsers.CSV (CsvDoc(..), writeFile, headerCsvGargV3)
import Data.Vector (fromList) import Data.Vector (fromList)
data Patent = Patent { _patent_title :: Text data Patent = Patent { _patent_title :: Text
, _patent_abstract :: Text , _patent_abstract :: Text
, _patent_year :: Text , _patent_year :: Text
, _patent_id :: Text , _patent_id :: Text
...@@ -49,7 +49,7 @@ json2csv fin fout = do ...@@ -49,7 +49,7 @@ json2csv fin fout = do
patent2csvDoc :: Patent -> CsvDoc patent2csvDoc :: Patent -> CsvDoc
patent2csvDoc (Patent title abstract year _) = patent2csvDoc (Patent title abstract year _) =
CsvDoc title "Source" (read (unpack year)) 1 1 abstract "Authors" CsvDoc title "Source" (Just $ read (unpack year)) (Just 1) (Just 1) abstract "Authors"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment