Commit 09e9fa50 authored by Przemyslaw Kaminski

[csv] more work on making the parser accept looser input

parent d63df339
Pipeline #1697 passed with stage
in 35 minutes and 28 seconds
......@@ -32,7 +32,6 @@ import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, d
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import qualified Data.Text as T
import Gargantext.Prelude
......@@ -40,6 +39,7 @@ import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
......@@ -152,8 +152,8 @@ csvToDocs parser patterns time path =
Right r ->
pure $ Vector.toList
$ Vector.take limit
$ Vector.map (\row -> Document (toPhyloDate (csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
(toPhyloDate' (csv_publication_year row) (csv_publication_month row) (csv_publication_day row))
$ Vector.map (\row -> Document (toPhyloDate (Csv.unIntOrDec $ csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
(toPhyloDate' (Csv.unIntOrDec $ csv_publication_year row) (csv_publication_month row) (csv_publication_day row))
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing
[]
......
......@@ -42,7 +42,7 @@ import Gargantext.Core.Types
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.WithList
import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year)
import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms (terms)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
......@@ -91,7 +91,7 @@ main = do
Right cf -> do
let corpus = DM.fromListWith (<>)
. DV.toList
. DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. DV.map (\n -> (unIntOrDec $ csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. snd $ cf
-- termListMap :: [Text]
......
......@@ -85,7 +85,7 @@ toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
-- | Types Conversions
toDocs :: Vector CsvDoc -> [CsvGargV3]
toDocs v = V.toList
$ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
$ V.zipWith (\nId (CsvDoc t s (IntOrDec py) pm pd abst auth)
-> CsvGargV3 nId t s py pm pd abst auth )
(V.enumFromN 1 (V.length v'')) v''
where
......@@ -96,7 +96,7 @@ toDocs v = V.toList
-- | Convert every 'CsvGargV3' row of the vector back into a 'CsvDoc'.
-- The first (id) field of 'CsvGargV3' is discarded; title, source, date
-- parts, abstract and authors are passed on in the same order, with the
-- publication year wrapped in 'IntOrDec' where 'CsvDoc' requires it.
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs
where
fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s (IntOrDec py) pm pd abst auth)
---------------------------------------------------------------
-- | Split a document in its context
......@@ -139,10 +139,21 @@ docsSize csvDoc = mean ls
---------------------------------------------------------------
-- | An 'Int' that may appear in CSV input either as an integer or as a
-- decimal number (see the 'FromField' instance, which floors decimals).
newtype IntOrDec = IntOrDec Int
  deriving (Show, Eq, Read)

-- | Extract the wrapped 'Int'.
unIntOrDec :: IntOrDec -> Int
unIntOrDec wrapped = case wrapped of
  IntOrDec n -> n
-- | Parse a CSV field as an 'IntOrDec'.
--
-- The field is first parsed as an 'Int'. Only if that fails is it parsed
-- as a 'Double' and rounded down with 'Prelude.floor'. When both attempts
-- fail, the failure reported is the one from the 'Double' parser.
instance FromField IntOrDec where
  parseField s = IntOrDec <$> (asInt <|> flooredDouble)
    where
      -- First choice: the field is already a plain integer.
      asInt :: Parser Int
      asInt = parseField s

      -- Fallback: accept a decimal and keep its floor.
      flooredDouble :: Parser Int
      flooredDouble = Prelude.floor <$> (parseField s :: Parser Double)
-- | Encode an 'IntOrDec' exactly as its wrapped 'Int' would be encoded.
instance ToField IntOrDec where
  toField = toField . unIntOrDec
data CsvDoc = CsvDoc
{ csv_title :: !Text
, csv_source :: !Text
, csv_publication_year :: !Int
, csv_publication_year :: !IntOrDec
, csv_publication_month :: !Int
, csv_publication_day :: !Int
, csv_abstract :: !Text
......@@ -151,13 +162,13 @@ data CsvDoc = CsvDoc
deriving (Show)
-- | Build a 'CsvDoc' from a named CSV record.
-- Each column is looked up under its lower-case, underscore-separated name
-- first (e.g. "publication_year") and, if that lookup fails, under the
-- capitalised, space-separated alternative (e.g. "Publication Year").
instance FromNamedRecord CsvDoc where
parseNamedRecord r = CsvDoc <$> r .: "title"
<*> r .: "source"
<*> r .: "publication_year"
<*> r .: "publication_month"
<*> r .: "publication_day"
<*> r .: "abstract"
<*> r .: "authors"
parseNamedRecord r = CsvDoc <$> (r .: "title" <|> r .: "Title")
<*> (r .: "source" <|> r .: "Source")
<*> (r .: "publication_year" <|> r .: "Publication Year")
<*> (r .: "publication_month" <|> r .: "Publication Month")
<*> (r .: "publication_day" <|> r .: "Publication Day")
<*> (r .: "abstract" <|> r .: "Abstract")
<*> (r .: "authors" <|> r .: "Authors")
instance ToNamedRecord CsvDoc where
toNamedRecord (CsvDoc t s py pm pd abst aut) =
......@@ -173,7 +184,7 @@ instance ToNamedRecord CsvDoc where
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
(m $ _hd_source h)
(mI $ _hd_publication_year h)
(IntOrDec $ mI $ _hd_publication_year h)
(mI $ _hd_publication_month h)
(mI $ _hd_publication_day h)
(m $ _hd_abstract h)
......@@ -357,7 +368,7 @@ csvHal2doc (CsvHal title source
csv2doc :: CsvDoc -> HyperdataDocument
csv2doc (CsvDoc title source
pub_year pub_month pub_day
(IntOrDec pub_year) pub_month pub_day
abstract authors ) = HyperdataDocument (Just "CsvHal")
Nothing
Nothing
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment