[refactoring] more record syntax refactoring

parent 37a36aba
Pipeline #1802 passed in 33 minutes and 36 seconds
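The commit replaces positional construction of the wide HyperdataDocument record (and the CSV/HAL record types) with named-field record syntax. A minimal sketch of the difference, using a hypothetical two-field Doc type rather than the real records:

    -- Hypothetical two-field record, for illustration only (not the real HyperdataDocument).
    data Doc = Doc { docTitle  :: String
                   , docSource :: Maybe String
                   } deriving Show

    -- Positional style (the one being removed): argument order is implicit.
    positional :: Doc
    positional = Doc "A title" Nothing

    -- Record syntax (the style this commit adopts): every field is named at the call site.
    named :: Doc
    named = Doc { docTitle  = "A title"
                , docSource = Nothing }

    main :: IO ()
    main = mapM_ print [positional, named]

With named fields, adding or reordering fields in a data declaration can no longer silently shift arguments into the wrong slots, which matters for records of this width.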
@@ -31,23 +31,23 @@ get la q ml = do

toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus i t ab d s aus affs struct_id) = do
  (utctime, (pub_year, pub_month, pub_day)) <- Date.dateSplit la (maybe (Just "2019") Just d)
  pure $ HyperdataDocument { _hd_bdd = Just "Hal"
                           , _hd_doi = Just $ pack $ show i
                           , _hd_url = Nothing
                           , _hd_uniqId = Nothing
                           , _hd_uniqIdBdd = Nothing
                           , _hd_page = Nothing
                           , _hd_title = Just $ intercalate " " t
                           , _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" aus
                           , _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" $ affs <> map (cs . show) struct_id
                           , _hd_source = Just $ maybe "Nothing" identity s
                           , _hd_abstract = Just $ intercalate " " ab
                           , _hd_publication_date = fmap (pack . show) utctime
                           , _hd_publication_year = pub_year
                           , _hd_publication_month = pub_month
                           , _hd_publication_day = pub_day
                           , _hd_publication_hour = Nothing
                           , _hd_publication_minute = Nothing
                           , _hd_publication_second = Nothing
                           , _hd_language_iso2 = Just $ (pack . show) la }
@@ -39,22 +39,23 @@ toDoc' la docs' = do

toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do
  (utctime, (pub_year, pub_month, pub_day)) <- Date.dateSplit la (maybe (Just "2019") (Just . pack . show) d)
  pure $ HyperdataDocument { _hd_bdd = Just "Istex"
                           , _hd_doi = Just i
                           , _hd_url = Nothing
                           , _hd_uniqId = Nothing
                           , _hd_uniqIdBdd = Nothing
                           , _hd_page = Nothing
                           , _hd_title = t
                           , _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" (map ISTEX._author_name a)
                           , _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" (concat $ (map ISTEX._author_affiliations) a)
                           , _hd_source = Just $ foldl (\x y -> x <> ", " <> y) "" (catMaybes $ map ISTEX._source_title s)
                           , _hd_abstract = ab
                           , _hd_publication_date = fmap (pack . show) utctime
                           , _hd_publication_year = pub_year
                           , _hd_publication_month = pub_month
                           , _hd_publication_day = pub_day
                           , _hd_publication_hour = Nothing
                           , _hd_publication_minute = Nothing
                           , _hd_publication_second = Nothing
                           , _hd_language_iso2 = Just $ (pack . show) la }
@@ -38,25 +38,25 @@ get q l = either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN))

toDoc :: Lang -> PubMedDoc.PubMed -> HyperdataDocument
toDoc l (PubMedDoc.PubMed (PubMedDoc.PubMedArticle t j as aus)
                          (PubMedDoc.PubMedDate a y m d)
         ) = HyperdataDocument { _hd_bdd = Just "PubMed"
                               , _hd_doi = Nothing
                               , _hd_url = Nothing
                               , _hd_uniqId = Nothing
                               , _hd_uniqIdBdd = Nothing
                               , _hd_page = Nothing
                               , _hd_title = t
                               , _hd_authors = authors aus
                               , _hd_institutes = institutes aus
                               , _hd_source = j
                               , _hd_abstract = abstract as
                               , _hd_publication_date = Just $ Text.pack $ show a
                               , _hd_publication_year = Just $ fromIntegral y
                               , _hd_publication_month = Just m
                               , _hd_publication_day = Just d
                               , _hd_publication_hour = Nothing
                               , _hd_publication_minute = Nothing
                               , _hd_publication_second = Nothing
                               , _hd_language_iso2 = Just $ (Text.pack . show) l }
  where
    authors :: Maybe [PubMedDoc.Author] -> Maybe Text
    authors aus' = case aus' of
...
@@ -122,25 +122,25 @@ toDoc ff d = do

  (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
  pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
                           , _hd_doi = lookup "doi" d
                           , _hd_url = lookup "URL" d
                           , _hd_uniqId = Nothing
                           , _hd_uniqIdBdd = Nothing
                           , _hd_page = Nothing
                           , _hd_title = lookup "title" d
                           , _hd_authors = Nothing
                           , _hd_institutes = lookup "authors" d
                           , _hd_source = lookup "source" d
                           , _hd_abstract = lookup "abstract" d
                           , _hd_publication_date = fmap (DT.pack . show) utcTime
                           , _hd_publication_year = pub_year
                           , _hd_publication_month = pub_month
                           , _hd_publication_day = pub_day
                           , _hd_publication_hour = Nothing
                           , _hd_publication_minute = Nothing
                           , _hd_publication_second = Nothing
                           , _hd_language_iso2 = Just $ (DT.pack . show) lang }

enrichWith :: FileFormat
           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
...
@@ -62,34 +62,39 @@ data CsvGargV3 = CsvGargV3

-- | Doc 2 HyperdataDocument
toDoc :: CsvGargV3 -> HyperdataDocument
toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
  HyperdataDocument { _hd_bdd = Just "CSV"
                    , _hd_doi = Just . pack . show $ did
                    , _hd_url = Nothing
                    , _hd_uniqId = Nothing
                    , _hd_uniqIdBdd = Nothing
                    , _hd_page = Nothing
                    , _hd_title = Just dt
                    , _hd_authors = Nothing
                    , _hd_institutes = Just dau
                    , _hd_source = Just dab
                    , _hd_abstract = Nothing
                    , _hd_publication_date = Nothing
                    , _hd_publication_year = Just dpy
                    , _hd_publication_month = Just dpm
                    , _hd_publication_day = Just dpd
                    , _hd_publication_hour = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2 = Nothing }

---------------------------------------------------------------
-- | Types Conversions
toDocs :: Vector CsvDoc -> [CsvGargV3]
toDocs v = V.toList
         $ V.zipWith (\nId (CsvDoc { .. }) -- (CsvDoc t s mPy pm pd abst auth)
                        -> CsvGargV3 { d_docId = nId
                                     , d_title = csv_title
                                     , d_source = csv_source
                                     , d_publication_year = fromMIntOrDec defaultYear csv_publication_year
                                     , d_publication_month = fromMaybe defaultMonth csv_publication_month
                                     , d_publication_day = fromMaybe defaultDay csv_publication_day
                                     , d_abstract = csv_abstract
                                     , d_authors = csv_authors })
                     (V.enumFromN 1 (V.length v'')) v''
  where
    v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
@@ -99,7 +104,13 @@ toDocs v = V.toList

fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs
  where
    fromDocs' (CsvGargV3 { .. }) = CsvDoc { csv_title = d_title
                                          , csv_source = d_source
                                          , csv_publication_year = Just $ IntOrDec d_publication_year
                                          , csv_publication_month = Just d_publication_month
                                          , csv_publication_day = Just d_publication_day
                                          , csv_abstract = d_abstract
                                          , csv_authors = d_authors }

---------------------------------------------------------------
-- | Split a document in its context
@@ -117,19 +128,17 @@ splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in

      V.fromList [doc]
  where
    splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
    splitDoc' contextSize (CsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
      where
        firstDoc = CsvDoc { csv_abstract = firstAbstract, .. }
        firstAbstract = head' "splitDoc'1" abstracts

        nextDocs = map (\txt -> CsvDoc { csv_title = head' "splitDoc'2" $ sentences txt
                                       , csv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
                                       , .. }
                       ) (tail' "splitDoc'2" abstracts)

        abstracts = (splitBy $ contextSize) csv_abstract

---------------------------------------------------------------
---------------------------------------------------------------
@@ -174,33 +183,35 @@ data CsvDoc = CsvDoc

  deriving (Show)

instance FromNamedRecord CsvDoc where
  parseNamedRecord r = do
    csv_title <- r .: "title" <|> r .: "Title"
    csv_source <- r .: "source" <|> r .: "Source"
    csv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
    csv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
    csv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
    csv_abstract <- r .: "abstract" <|> r .: "Abstract"
    csv_authors <- r .: "authors" <|> r .: "Authors"
    pure $ CsvDoc { .. }

instance ToNamedRecord CsvDoc where
  toNamedRecord (CsvDoc{ .. }) =
    namedRecord [ "title" .= csv_title
                , "source" .= csv_source
                , "publication_year" .= csv_publication_year
                , "publication_month" .= csv_publication_month
                , "publication_day" .= csv_publication_day
                , "abstract" .= csv_abstract
                , "authors" .= csv_authors
                ]

hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h = CsvDoc { csv_title = m $ _hd_title h
                                    , csv_source = m $ _hd_source h
                                    , csv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
                                    , csv_publication_month = Just $ mI $ _hd_publication_month h
                                    , csv_publication_day = Just $ mI $ _hd_publication_day h
                                    , csv_abstract = m $ _hd_abstract h
                                    , csv_authors = m $ _hd_authors h }
  where
    m = maybe "" identity
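Several of the rewritten instances lean on the RecordWildCards idiom (`CsvDoc { .. }`): in a pattern it binds every field to a variable of the same name, and in an expression it fills each omitted field from a like-named binding already in scope. A minimal sketch with a hypothetical Person type, independent of the cassava instances above:

    {-# LANGUAGE RecordWildCards #-}

    -- Hypothetical record, for illustration only.
    data Person = Person { name :: String, age :: Int } deriving Show

    -- Pattern side: `Person { .. }` brings `name` and `age` into scope.
    describe :: Person -> String
    describe (Person { .. }) = name <> " is " <> show age

    -- Construction side: `Person { .. }` fills each field from a
    -- like-named binding in scope (here, from the where clause).
    fromPair :: (String, Int) -> Person
    fromPair (n, a) = Person { .. }
      where
        name = n
        age  = a

    main :: IO ()
    main = putStrLn (describe (fromPair ("Ada", 36)))

In the parseNamedRecord rewrites above, the do-block binds variables whose names match the record's fields, so a single `pure $ CsvDoc { .. }` assembles the result.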
@@ -300,110 +311,109 @@ data CsvHal = CsvHal

  deriving (Show)

instance FromNamedRecord CsvHal where
  parseNamedRecord r = do
    csvHal_title <- r .: "title"
    csvHal_source <- r .: "source"
    csvHal_publication_year <- r .: "publication_year"
    csvHal_publication_month <- r .: "publication_month"
    csvHal_publication_day <- r .: "publication_day"
    csvHal_abstract <- r .: "abstract"
    csvHal_authors <- r .: "authors"
    csvHal_url <- r .: "url"
    csvHal_isbn_s <- r .: "isbn_s"
    csvHal_issue_s <- r .: "issue_s"
    csvHal_journalPublisher_s <- r .: "journalPublisher_s"
    csvHal_language_s <- r .: "language_s"
    csvHal_doiId_s <- r .: "doiId_s"
    csvHal_authId_i <- r .: "authId_i"
    csvHal_instStructId_i <- r .: "instStructId_i"
    csvHal_deptStructId_i <- r .: "deptStructId_i"
    csvHal_labStructId_i <- r .: "labStructId_i"
    csvHal_rteamStructId_i <- r .: "rteamStructId_i"
    csvHal_docType_s <- r .: "docType_s"
    pure $ CsvHal { .. }

instance ToNamedRecord CsvHal where
  --toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
  toNamedRecord (CsvHal { .. }) =
    namedRecord [ "title" .= csvHal_title
                , "source" .= csvHal_source

                , "publication_year" .= csvHal_publication_year
                , "publication_month" .= csvHal_publication_month
                , "publication_day" .= csvHal_publication_day

                , "abstract" .= csvHal_abstract
                , "authors" .= csvHal_authors

                , "url" .= csvHal_url
                , "isbn_s" .= csvHal_isbn_s
                , "issue_s" .= csvHal_issue_s
                , "journalPublisher_s" .= csvHal_journalPublisher_s
                , "language_s" .= csvHal_language_s

                , "doiId_s" .= csvHal_doiId_s
                , "authId_i" .= csvHal_authId_i
                , "instStructId_i" .= csvHal_instStructId_i
                , "deptStructId_i" .= csvHal_deptStructId_i
                , "labStructId_i" .= csvHal_labStructId_i
                , "rteamStructId_i" .= csvHal_rteamStructId_i
                , "docType_s" .= csvHal_docType_s
                ]
csvHal2doc :: CsvHal -> HyperdataDocument
csvHal2doc (CsvHal { .. }) =
  HyperdataDocument { _hd_bdd = Just "CsvHal"
                    , _hd_doi = Just csvHal_doiId_s
                    , _hd_url = Just csvHal_url
                    , _hd_uniqId = Nothing
                    , _hd_uniqIdBdd = Nothing
                    , _hd_page = Nothing
                    , _hd_title = Just csvHal_title
                    , _hd_authors = Just csvHal_authors
                    , _hd_institutes = Just csvHal_instStructId_i
                    , _hd_source = Just csvHal_source
                    , _hd_abstract = Just csvHal_abstract
                    , _hd_publication_date = Just $ pack . show $ jour csvHal_publication_year
                                                                       csvHal_publication_month
                                                                       csvHal_publication_day
                    , _hd_publication_year = Just $ fromIntegral csvHal_publication_year
                    , _hd_publication_month = Just csvHal_publication_month
                    , _hd_publication_day = Just csvHal_publication_day
                    , _hd_publication_hour = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2 = Nothing }

csv2doc :: CsvDoc -> HyperdataDocument
csv2doc (CsvDoc { .. })
  = HyperdataDocument { _hd_bdd = Just "CsvHal"
                      , _hd_doi = Nothing
                      , _hd_url = Nothing
                      , _hd_uniqId = Nothing
                      , _hd_uniqIdBdd = Nothing
                      , _hd_page = Nothing
                      , _hd_title = Just csv_title
                      , _hd_authors = Just csv_authors
                      , _hd_institutes = Nothing
                      , _hd_source = Just csv_source
                      , _hd_abstract = Just csv_abstract
                      , _hd_publication_date = Just $ pack . show $ jour (fromIntegral pubYear)
                                                                         pubMonth
                                                                         pubDay
                      , _hd_publication_year = Just pubYear
                      , _hd_publication_month = Just pubMonth
                      , _hd_publication_day = Just pubDay
                      , _hd_publication_hour = Nothing
                      , _hd_publication_minute = Nothing
                      , _hd_publication_second = Nothing
                      , _hd_language_iso2 = Nothing }
  where
    pubYear = fromMIntOrDec defaultYear csv_publication_year
    pubMonth = fromMaybe defaultMonth csv_publication_month
    pubDay = fromMaybe defaultDay csv_publication_day

------------------------------------------------------------------------
parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
@@ -438,14 +448,16 @@ data Csv' = Csv'

instance FromNamedRecord Csv' where
  parseNamedRecord r = do
    csv'_title <- r .: "title"
    csv'_source <- r .: "source"
    csv'_publication_year <- r .: "publication_year"
    csv'_publication_month <- r .: "publication_month"
    csv'_publication_day <- r .: "publication_day"
    csv'_abstract <- r .: "abstract"
    csv'_authors <- r .: "authors"
    csv'_weight <- r .: "weight"
    pure $ Csv' { .. }

readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
readWeightedCsv fp =
...
@@ -75,18 +75,26 @@ instance ToJSON GrandDebatReference

instance ToHyperdataDocument GrandDebatReference
  where
    toHyperdataDocument (GrandDebatReference { id, title, publishedAt, authorType, authorZipCode, responses }) =
      HyperdataDocument { _hd_bdd = Just "GrandDebat"
                        , _hd_doi = id
                        , _hd_url = Nothing
                        , _hd_uniqId = Nothing
                        , _hd_uniqIdBdd = Nothing
                        , _hd_page = Nothing
                        , _hd_title = title
                        , _hd_authors = authorType
                        , _hd_institutes = authorType
                        , _hd_source = authorZipCode
                        , _hd_abstract = toAbstract <$> responses
                        , _hd_publication_date = publishedAt
                        , _hd_publication_year = Nothing
                        , _hd_publication_month = Nothing
                        , _hd_publication_day = Nothing
                        , _hd_publication_hour = Nothing
                        , _hd_publication_minute = Nothing
                        , _hd_publication_second = Nothing
                        , _hd_language_iso2 = Just $ Text.pack $ show FR }
      where
        toAbstract = (Text.intercalate " . ") . ((filter (/= "")) . (map toSentence))
        toSentence (GrandDebatResponse _id _qtitle _qvalue r) = case r of
...
@@ -119,17 +119,24 @@ unbound _ _ = Nothing

bind2doc :: Lang -> [BindingValue] -> HyperdataDocument
bind2doc l [ link, date, langDoc, authors, _source, publisher, title, abstract ] =
  HyperdataDocument { _hd_bdd = Just "Isidore"
                    , _hd_doi = Nothing
                    , _hd_url = unbound l link
                    , _hd_uniqId = Nothing
                    , _hd_uniqIdBdd = Nothing
                    , _hd_page = Nothing
                    , _hd_title = unbound l title
                    , _hd_authors = unbound l authors
                    , _hd_institutes = Nothing
                    , _hd_source = unbound l publisher
                    , _hd_abstract = unbound l abstract
                    , _hd_publication_date = unbound l date
                    , _hd_publication_year = Nothing
                    , _hd_publication_month = Nothing
                    , _hd_publication_day = Nothing
                    , _hd_publication_hour = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2 = unbound l langDoc }

bind2doc _ _ = undefined
@@ -48,8 +48,14 @@ json2csv fin fout = do

  writeFile fout (headerCsvGargV3, fromList $ map patent2csvDoc patents)

patent2csvDoc :: Patent -> CsvDoc
patent2csvDoc (Patent { .. }) =
  CsvDoc { csv_title = _patent_title
         , csv_source = "Source"
         , csv_publication_year = Just $ read (unpack _patent_year)
         , csv_publication_month = Just 1
         , csv_publication_day = Just 1
         , csv_abstract = _patent_abstract
         , csv_authors = "Authors" }
...
@@ -70,3 +70,5 @@ onField :: ByteString -> (ByteString -> [(ByteString, ByteString)])

                     -> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
onField k f m = m <> ( maybe [] f (lookup k m) )
@@ -68,5 +68,3 @@ fixFields ns = map (first fixFields'') ns

      | champs == "UR" = "url"
      | champs == "N2" = abstract
      | otherwise = champs
@@ -95,7 +95,9 @@ parsePage =

  revision <-
    parseRevision
  many_ $ ignoreAnyTreeContent
  return $ Page { _markupFormat = Mediawiki
                , _title = title
                , _text = revision }

parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki =
@@ -108,7 +110,7 @@ mediawikiPageToPlain :: Page -> IO Page

mediawikiPageToPlain page = do
  title <- mediaToPlain $ _title page
  revision <- mediaToPlain $ _text page
  return $ Page { _markupFormat = Plaintext, _title = title, _text = revision }
  where mediaToPlain media =
          case media of
            (Nothing) -> return Nothing
...
@@ -86,17 +86,17 @@ buildNgramsLists user uCid mCid mfslw gp = do

data MapListSize = MapListSize { unMapListSize :: !Int }

buildNgramsOthersList :: ( HasNodeError err
                         , CmdM env err m
                         , HasNodeStory env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> Maybe FlowSocialListWith
                      -> GroupParams
                      -> (NgramsType, MapListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize) = do
  allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
@@ -106,7 +106,7 @@ buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize)

            $ HashMap.fromList
            $ List.zip (HashMap.keys allTerms)
                       (List.cycle [mempty])
          )
  let
    groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
@@ -148,13 +148,13 @@ buildNgramsTermsList :: ( HasNodeError err

                        , HasNodeStory env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> Maybe FlowSocialListWith
                     -> GroupParams
                     -> (NgramsType, MapListSize)
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize)= do
  -- Filter 0 With Double
@@ -170,7 +170,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize)= do

            $ HashMap.fromList
            $ List.zip (HashMap.keys allTerms)
                       (List.cycle [mempty])
          )
  printDebug "[buldNgramsTermsList: Flow Social List / end]" nt
  let ngramsKeys = HashMap.keysSet allTerms
...
@@ -72,7 +72,7 @@ groupWith :: GroupParams

          -> NgramsTerm
          -> NgramsTerm
groupWith GroupIdentity t = identity t
groupWith (GroupParams { unGroupParams_lang = l }) t =
    NgramsTerm
  $ Text.intercalate " "
  $ map (stem l)
@@ -86,7 +86,7 @@ groupWith (GroupParams l _m _n _) t =

  $ unNgramsTerm t
-- | This lemmatization group done with CoreNLP algo (or others)
groupWith (GroupWithPosTag { _gwl_map = m }) t =
  case HashMap.lookup (unNgramsTerm t) m of
    Nothing -> clean t
    Just t' -> clean $ NgramsTerm t'
...
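The groupWith clauses above now match only the field each equation actually uses, instead of threading every positional argument through with wildcards. A small sketch of that named-field pattern idiom, using a made-up constructor rather than GroupParams:

    -- Hypothetical configuration type, for illustration only.
    data Config = Fancy { lang :: String, size :: Int, verbose :: Bool }
                | Plain

    -- Named-field pattern: bind `lang` only; the remaining fields are
    -- ignored without writing a wildcard for each of them.
    greeting :: Config -> String
    greeting (Fancy { lang = l }) = "configured for " <> l
    greeting Plain                = "default configuration"

    main :: IO ()
    main = putStrLn (greeting (Fancy { lang = "fr", size = 3, verbose = False }))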
@@ -82,11 +82,11 @@ makeLenses ''TermType

--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms (Unsupervised {..}) xs = mapM (terms (Unsupervised { _tt_model = Just m', .. })) xs
  where
    m' = case _tt_model of
      Just m''-> m''
      Nothing -> newTries _tt_windowSize (Text.intercalate " " xs)

extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
@@ -96,15 +96,16 @@ withLang :: (Foldable t, Functor t, HasText h)

         => TermType Lang
         -> t h
         -> TermType Lang
withLang (Unsupervised {..}) ns = Unsupervised { _tt_model = m', .. }
  where
    m' = case _tt_model of
      Nothing -> -- trace ("buildTries here" :: String)
                 Just $ buildTries _tt_ngramsSize
                      $ fmap toToken
                      $ uniText
                      $ Text.intercalate " . "
                      $ List.concat
                      $ map hasText ns
      just_m -> just_m
withLang l _ = l
@@ -171,9 +172,9 @@ terms :: TermType Lang -> Text -> IO [Terms]

terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (Unsupervised { .. }) txt = termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
  where
    m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
...
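The extractTerms, withLang, and terms rewrites combine both sides of RecordWildCards: the pattern `Unsupervised {..}` takes the value apart, and the expression `Unsupervised { _tt_model = Just m', .. }` rebuilds it with one field replaced while `..` copies the rest from the bindings the pattern introduced. Sketched with a hypothetical Settings type:

    {-# LANGUAGE RecordWildCards #-}

    -- Hypothetical settings record, for illustration only.
    data Settings = Settings { retries :: Int, timeout :: Int, cache :: Maybe FilePath }
      deriving Show

    -- Rebuild the record with one field changed; `..` carries the
    -- remaining fields over from the pattern match.
    withDefaultCache :: Settings -> Settings
    withDefaultCache (Settings {..}) = Settings { cache = Just "/tmp/cache", .. }

    main :: IO ()
    main = print (withDefaultCache (Settings { retries = 3, timeout = 30, cache = Nothing }))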