Commit 09e9fa50 authored by Przemyslaw Kaminski

[csv] more work on making the parser accept looser input

parent d63df339
...@@ -32,7 +32,6 @@ import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, d ...@@ -32,7 +32,6 @@ import Data.Time.Calendar (fromGregorian, diffGregorianDurationClip, cdMonths, d
import qualified Data.ByteString.Char8 as C8 import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Lazy as Lazy import qualified Data.ByteString.Lazy as Lazy
import qualified Data.Vector as Vector import qualified Data.Vector as Vector
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import qualified Data.Text as T import qualified Data.Text as T
import Gargantext.Prelude import Gargantext.Prelude
...@@ -40,6 +39,7 @@ import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..)) ...@@ -40,6 +39,7 @@ import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Core.Text.Context (TermList) import Gargantext.Core.Text.Context (TermList)
import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day, import Gargantext.Core.Text.Corpus.Parsers.CSV (csv_title, csv_abstract, csv_publication_year, csv_publication_month, csv_publication_day,
csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight) csv'_source, csv'_title, csv'_abstract, csv'_publication_year, csv'_publication_month, csv'_publication_day, csv'_weight)
import qualified Gargantext.Core.Text.Corpus.Parsers.CSV as Csv
import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile) import Gargantext.Core.Text.Corpus.Parsers (FileFormat(..),parseFile)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList) import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList) import Gargantext.Core.Text.Terms.WithList (Patterns, buildPatterns, extractTermsWithList)
...@@ -152,8 +152,8 @@ csvToDocs parser patterns time path = ...@@ -152,8 +152,8 @@ csvToDocs parser patterns time path =
Right r -> Right r ->
pure $ Vector.toList pure $ Vector.toList
$ Vector.take limit $ Vector.take limit
$ Vector.map (\row -> Document (toPhyloDate (csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time) $ Vector.map (\row -> Document (toPhyloDate (Csv.unIntOrDec $ csv_publication_year row) (csv_publication_month row) (csv_publication_day row) time)
(toPhyloDate' (csv_publication_year row) (csv_publication_month row) (csv_publication_day row)) (toPhyloDate' (Csv.unIntOrDec $ csv_publication_year row) (csv_publication_month row) (csv_publication_day row))
(termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row)) (termsInText patterns $ (csv_title row) <> " " <> (csv_abstract row))
Nothing Nothing
[] []
......
...@@ -42,7 +42,7 @@ import Gargantext.Core.Types ...@@ -42,7 +42,7 @@ import Gargantext.Core.Types
import Gargantext.Core.Text.Terms import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Context import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.WithList import Gargantext.Core.Text.Terms.WithList
import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year) import Gargantext.Core.Text.Corpus.Parsers.CSV (readFile, csv_title, csv_abstract, csv_publication_year, unIntOrDec)
import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList) import Gargantext.Core.Text.List.Formats.CSV (csvMapTermList)
import Gargantext.Core.Text.Terms (terms) import Gargantext.Core.Text.Terms (terms)
import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs) import Gargantext.Core.Text.Metrics.Count (coocOnContexts, Coocs)
...@@ -91,7 +91,7 @@ main = do ...@@ -91,7 +91,7 @@ main = do
Right cf -> do Right cf -> do
let corpus = DM.fromListWith (<>) let corpus = DM.fromListWith (<>)
. DV.toList . DV.toList
. DV.map (\n -> (csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)])) . DV.map (\n -> (unIntOrDec $ csv_publication_year n, [(csv_title n) <> " " <> (csv_abstract n)]))
. snd $ cf . snd $ cf
-- termListMap :: [Text] -- termListMap :: [Text]
......
...@@ -85,7 +85,7 @@ toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) = ...@@ -85,7 +85,7 @@ toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
-- | Types Conversions -- | Types Conversions
toDocs :: Vector CsvDoc -> [CsvGargV3] toDocs :: Vector CsvDoc -> [CsvGargV3]
toDocs v = V.toList toDocs v = V.toList
$ V.zipWith (\nId (CsvDoc t s py pm pd abst auth) $ V.zipWith (\nId (CsvDoc t s (IntOrDec py) pm pd abst auth)
-> CsvGargV3 nId t s py pm pd abst auth ) -> CsvGargV3 nId t s py pm pd abst auth )
(V.enumFromN 1 (V.length v'')) v'' (V.enumFromN 1 (V.length v'')) v''
where where
...@@ -96,7 +96,7 @@ toDocs v = V.toList ...@@ -96,7 +96,7 @@ toDocs v = V.toList
-- | Maps every 'CsvGargV3' in the vector to a 'CsvDoc', discarding the first
-- constructor field and keeping the remaining ones in the same order.
-- Each line below is a two-column diff rendering (old text, then new text):
-- the new side wraps the publication year in 'IntOrDec' before storing it.
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs fromDocs docs = V.map fromDocs' docs
where where
fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth) fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s (IntOrDec py) pm pd abst auth)
--------------------------------------------------------------- ---------------------------------------------------------------
-- | Split a document in its context -- | Split a document in its context
...@@ -139,10 +139,21 @@ docsSize csvDoc = mean ls ...@@ -139,10 +139,21 @@ docsSize csvDoc = mean ls
--------------------------------------------------------------- ---------------------------------------------------------------
-- | An 'Int' wrapper used for CSV fields that may be written either as an
-- integer or as a decimal number (see its 'FromField' instance).
newtype IntOrDec = IntOrDec Int
  deriving (Show, Eq, Read)

-- | Unwrap the 'Int' carried by an 'IntOrDec'.
unIntOrDec :: IntOrDec -> Int
unIntOrDec wrapped = case wrapped of
  IntOrDec n -> n
-- | Decode a CSV field as an 'Int' first. If that parse fails, decode the
-- same field as a 'Double' and round it down with 'Prelude.floor'; a failure
-- of the 'Double' parse is what propagates to the caller.
instance FromField IntOrDec where
  parseField field =
    case runParser intAttempt of
      Right whole -> pure (IntOrDec whole)
      -- The integer parse error is dropped on purpose: the decimal parse
      -- decides the final outcome.
      Left _      -> IntOrDec . Prelude.floor <$> decimalAttempt
    where
      intAttempt :: Parser Int
      intAttempt = parseField field

      decimalAttempt :: Parser Double
      decimalAttempt = parseField field

-- | Encode an 'IntOrDec' exactly like the 'Int' it wraps.
instance ToField IntOrDec where
  toField = toField . unIntOrDec
data CsvDoc = CsvDoc data CsvDoc = CsvDoc
{ csv_title :: !Text { csv_title :: !Text
, csv_source :: !Text , csv_source :: !Text
, csv_publication_year :: !Int , csv_publication_year :: !IntOrDec
, csv_publication_month :: !Int , csv_publication_month :: !Int
, csv_publication_day :: !Int , csv_publication_day :: !Int
, csv_abstract :: !Text , csv_abstract :: !Text
...@@ -151,13 +162,13 @@ data CsvDoc = CsvDoc ...@@ -151,13 +162,13 @@ data CsvDoc = CsvDoc
deriving (Show) deriving (Show)
-- | Builds a 'CsvDoc' from a named CSV record, one column lookup per field.
-- Each line below is a two-column diff rendering (old text, then new text):
-- the old side looks up only the lowercase header names, while the new side
-- falls back via '<|>' to a capitalised, space-separated header
-- (e.g. "Title", "Publication Year") when the lowercase lookup fails.
instance FromNamedRecord CsvDoc where instance FromNamedRecord CsvDoc where
parseNamedRecord r = CsvDoc <$> r .: "title" parseNamedRecord r = CsvDoc <$> (r .: "title" <|> r .: "Title")
<*> r .: "source" <*> (r .: "source" <|> r .: "Source")
<*> r .: "publication_year" <*> (r .: "publication_year" <|> r .: "Publication Year")
<*> r .: "publication_month" <*> (r .: "publication_month" <|> r .: "Publication Month")
<*> r .: "publication_day" <*> (r .: "publication_day" <|> r .: "Publication Day")
<*> r .: "abstract" <*> (r .: "abstract" <|> r .: "Abstract")
<*> r .: "authors" <*> (r .: "authors" <|> r .: "Authors")
instance ToNamedRecord CsvDoc where instance ToNamedRecord CsvDoc where
toNamedRecord (CsvDoc t s py pm pd abst aut) = toNamedRecord (CsvDoc t s py pm pd abst aut) =
...@@ -173,7 +184,7 @@ instance ToNamedRecord CsvDoc where ...@@ -173,7 +184,7 @@ instance ToNamedRecord CsvDoc where
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h) hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
(m $ _hd_source h) (m $ _hd_source h)
(mI $ _hd_publication_year h) (IntOrDec $ mI $ _hd_publication_year h)
(mI $ _hd_publication_month h) (mI $ _hd_publication_month h)
(mI $ _hd_publication_day h) (mI $ _hd_publication_day h)
(m $ _hd_abstract h) (m $ _hd_abstract h)
...@@ -357,7 +368,7 @@ csvHal2doc (CsvHal title source ...@@ -357,7 +368,7 @@ csvHal2doc (CsvHal title source
csv2doc :: CsvDoc -> HyperdataDocument csv2doc :: CsvDoc -> HyperdataDocument
csv2doc (CsvDoc title source csv2doc (CsvDoc title source
pub_year pub_month pub_day (IntOrDec pub_year) pub_month pub_day
abstract authors ) = HyperdataDocument (Just "CsvHal") abstract authors ) = HyperdataDocument (Just "CsvHal")
Nothing Nothing
Nothing Nothing
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment