[refactoring] more record syntax refactoring

parent 37a36aba
......@@ -31,23 +31,23 @@ get la q ml = do
toDoc' :: Lang -> HAL.Corpus -> IO HyperdataDocument
toDoc' la (HAL.Corpus i t ab d s aus affs struct_id) = do
(utctime, (pub_year, pub_month, pub_day)) <- Date.dateSplit la (maybe (Just "2019") Just d)
pure $ HyperdataDocument (Just "Hal")
(Just $ pack $ show i)
Nothing
Nothing
Nothing
Nothing
(Just $ intercalate " " t)
(Just $ foldl (\x y -> x <> ", " <> y) "" aus)
(Just $ foldl (\x y -> x <> ", " <> y) "" $ affs <> map (cs . show) struct_id)
(Just $ maybe "Nothing" identity s)
(Just $ intercalate " " ab)
(fmap (pack . show) utctime)
pub_year
pub_month
pub_day
Nothing
Nothing
Nothing
(Just $ (pack . show) la)
pure $ HyperdataDocument { _hd_bdd = Just "Hal"
, _hd_doi = Just $ pack $ show i
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = Just $ intercalate " " t
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" aus
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" $ affs <> map (cs . show) struct_id
, _hd_source = Just $ maybe "Nothing" identity s
, _hd_abstract = Just $ intercalate " " ab
, _hd_publication_date = fmap (pack . show) utctime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (pack . show) la }
......@@ -39,22 +39,23 @@ toDoc' la docs' = do
toDoc :: Lang -> ISTEX.Document -> IO HyperdataDocument
toDoc la (ISTEX.Document i t a ab d s) = do
(utctime, (pub_year, pub_month, pub_day)) <- Date.dateSplit la (maybe (Just "2019") (Just . pack . show) d)
pure $ HyperdataDocument (Just "Istex")
(Just i)
Nothing
Nothing
Nothing
Nothing
t
(Just $ foldl (\x y -> x <> ", " <> y) "" (map ISTEX._author_name a))
(Just $ foldl (\x y -> x <> ", " <> y) "" (concat $ (map ISTEX._author_affiliations) a))
(Just $ foldl (\x y -> x <> ", " <> y) "" (catMaybes $ map ISTEX._source_title s))
ab
(fmap (pack . show) utctime)
pub_year
pub_month
pub_day
Nothing
Nothing
Nothing
(Just $ (pack . show) la)
pure $ HyperdataDocument { _hd_bdd = Just "Istex"
, _hd_doi = Just i
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = t
, _hd_authors = Just $ foldl (\x y -> x <> ", " <> y) "" (map ISTEX._author_name a)
, _hd_institutes = Just $ foldl (\x y -> x <> ", " <> y) "" (concat $ (map ISTEX._author_affiliations) a)
, _hd_source = Just $ foldl (\x y -> x <> ", " <> y) "" (catMaybes $ map ISTEX._source_title s)
, _hd_abstract = ab
, _hd_publication_date = fmap (pack . show) utctime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (pack . show) la }
......@@ -38,25 +38,25 @@ get q l = either (\e -> panic $ "CRAWL: PubMed" <> e) (map (toDoc EN))
toDoc :: Lang -> PubMedDoc.PubMed -> HyperdataDocument
toDoc l (PubMedDoc.PubMed (PubMedDoc.PubMedArticle t j as aus)
(PubMedDoc.PubMedDate a y m d)
) = HyperdataDocument (Just "PubMed")
Nothing
Nothing
Nothing
Nothing
Nothing
t
(authors aus)
(institutes aus)
j
(abstract as)
(Just $ Text.pack $ show a)
(Just $ fromIntegral y)
(Just m)
(Just d)
Nothing
Nothing
Nothing
(Just $ (Text.pack . show) l)
) = HyperdataDocument { _hd_bdd = Just "PubMed"
, _hd_doi = Nothing
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = t
, _hd_authors = authors aus
, _hd_institutes = institutes aus
, _hd_source = j
, _hd_abstract = abstract as
, _hd_publication_date = Just $ Text.pack $ show a
, _hd_publication_year = Just $ fromIntegral y
, _hd_publication_month = Just m
, _hd_publication_day = Just d
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (Text.pack . show) l }
where
authors :: Maybe [PubMedDoc.Author] -> Maybe Text
authors aus' = case aus' of
......
......@@ -122,25 +122,25 @@ toDoc ff d = do
(utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
pure $ HyperdataDocument (Just $ DT.pack $ show ff)
(lookup "doi" d)
(lookup "URL" d)
Nothing
Nothing
Nothing
(lookup "title" d)
Nothing
(lookup "authors" d)
(lookup "source" d)
(lookup "abstract" d)
(fmap (DT.pack . show) utcTime)
(pub_year)
(pub_month)
(pub_day)
Nothing
Nothing
Nothing
(Just $ (DT.pack . show) lang)
pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
, _hd_doi = lookup "doi" d
, _hd_url = lookup "URL" d
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = lookup "title" d
, _hd_authors = Nothing
, _hd_institutes = lookup "authors" d
, _hd_source = lookup "source" d
, _hd_abstract = lookup "abstract" d
, _hd_publication_date = fmap (DT.pack . show) utcTime
, _hd_publication_year = pub_year
, _hd_publication_month = pub_month
, _hd_publication_day = pub_day
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (DT.pack . show) lang }
enrichWith :: FileFormat
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
......
......@@ -75,18 +75,26 @@ instance ToJSON GrandDebatReference
instance ToHyperdataDocument GrandDebatReference
where
toHyperdataDocument (GrandDebatReference id' _ref title'
_createdAt' publishedAt' _updatedAt
_trashed _trashedStatus
_authorId authorType' authorZipCode'
responses') =
HyperdataDocument (Just "GrandDebat") id'
Nothing Nothing Nothing Nothing
title' authorType' authorType' authorZipCode'
(toAbstract <$> responses')
publishedAt'
Nothing Nothing Nothing Nothing Nothing Nothing
(Just $ Text.pack $ show FR)
toHyperdataDocument (GrandDebatReference { id, title, publishedAt, authorType, authorZipCode, responses }) =
HyperdataDocument { _hd_bdd = Just "GrandDebat"
, _hd_doi = id
, _hd_url = Nothing
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = title
, _hd_authors = authorType
, _hd_institutes = authorType
, _hd_source = authorZipCode
, _hd_abstract = toAbstract <$> responses
, _hd_publication_date = publishedAt
, _hd_publication_year = Nothing
, _hd_publication_month = Nothing
, _hd_publication_day = Nothing
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ Text.pack $ show FR }
where
toAbstract = (Text.intercalate " . ") . ((filter (/= "")) . (map toSentence))
toSentence (GrandDebatResponse _id _qtitle _qvalue r) = case r of
......
......@@ -119,17 +119,24 @@ unbound _ _ = Nothing
bind2doc :: Lang -> [BindingValue] -> HyperdataDocument
bind2doc l [ link, date, langDoc, authors, _source, publisher, title, abstract ] =
HyperdataDocument (Just "Isidore")
Nothing
(unbound l link)
Nothing Nothing Nothing
(unbound l title)
(unbound l authors)
Nothing
(unbound l publisher)
(unbound l abstract)
(unbound l date)
Nothing Nothing Nothing Nothing Nothing Nothing
(unbound l langDoc)
HyperdataDocument { _hd_bdd = Just "Isidore"
, _hd_doi = Nothing
, _hd_url = unbound l link
, _hd_uniqId = Nothing
, _hd_uniqIdBdd = Nothing
, _hd_page = Nothing
, _hd_title = unbound l title
, _hd_authors = unbound l authors
, _hd_institutes = Nothing
, _hd_source = unbound l publisher
, _hd_abstract = unbound l abstract
, _hd_publication_date = unbound l date
, _hd_publication_year = Nothing
, _hd_publication_month = Nothing
, _hd_publication_day = Nothing
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = unbound l langDoc }
bind2doc _ _ = undefined
......@@ -48,8 +48,14 @@ json2csv fin fout = do
writeFile fout (headerCsvGargV3, fromList $ map patent2csvDoc patents)
patent2csvDoc :: Patent -> CsvDoc
patent2csvDoc (Patent title abstract year _) =
CsvDoc title "Source" (Just $ read (unpack year)) (Just 1) (Just 1) abstract "Authors"
patent2csvDoc (Patent { .. }) =
CsvDoc { csv_title = _patent_title
, csv_source = "Source"
, csv_publication_year = Just $ read (unpack _patent_year)
, csv_publication_month = Just 1
, csv_publication_day = Just 1
, csv_abstract = _patent_abstract
, csv_authors = "Authors" }
......
......@@ -70,3 +70,5 @@ onField :: ByteString -> (ByteString -> [(ByteString, ByteString)])
-> [(ByteString, ByteString)] -> [(ByteString, ByteString)]
onField k f m = m <> ( maybe [] f (lookup k m) )
......@@ -68,5 +68,3 @@ fixFields ns = map (first fixFields'') ns
| champs == "UR" = "url"
| champs == "N2" = abstract
| otherwise = champs
......@@ -95,7 +95,9 @@ parsePage =
revision <-
parseRevision
many_ $ ignoreAnyTreeContent
return $ Page Mediawiki title revision
return $ Page { _markupFormat = Mediawiki
, _title = title
, _text = revision }
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki =
......@@ -108,7 +110,7 @@ mediawikiPageToPlain :: Page -> IO Page
mediawikiPageToPlain page = do
title <- mediaToPlain $ _title page
revision <- mediaToPlain $ _text page
return $ Page Plaintext title revision
return $ Page { _markupFormat = Plaintext, _title = title, _text = revision }
where mediaToPlain media =
case media of
(Nothing) -> return Nothing
......
......@@ -86,17 +86,17 @@ buildNgramsLists user uCid mCid mfslw gp = do
data MapListSize = MapListSize { unMapListSize :: !Int }
buildNgramsOthersList ::( HasNodeError err
, CmdM env err m
, HasNodeStory env err m
, HasTreeError err
)
=> User
-> UserCorpusId
-> Maybe FlowSocialListWith
-> GroupParams
-> (NgramsType, MapListSize)
-> m (Map NgramsType [NgramsElement])
buildNgramsOthersList :: ( HasNodeError err
, CmdM env err m
, HasNodeStory env err m
, HasTreeError err
)
=> User
-> UserCorpusId
-> Maybe FlowSocialListWith
-> GroupParams
-> (NgramsType, MapListSize)
-> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize) = do
allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
......@@ -106,7 +106,7 @@ buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize)
$ HashMap.fromList
$ List.zip (HashMap.keys allTerms)
(List.cycle [mempty])
)
)
let
groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
......@@ -148,13 +148,13 @@ buildNgramsTermsList :: ( HasNodeError err
, HasNodeStory env err m
, HasTreeError err
)
=> User
-> UserCorpusId
-> MasterCorpusId
-> Maybe FlowSocialListWith
-> GroupParams
-> (NgramsType, MapListSize)
-> m (Map NgramsType [NgramsElement])
=> User
-> UserCorpusId
-> MasterCorpusId
-> Maybe FlowSocialListWith
-> GroupParams
-> (NgramsType, MapListSize)
-> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize)= do
-- Filter 0 With Double
......@@ -170,7 +170,7 @@ buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize)= do
$ HashMap.fromList
$ List.zip (HashMap.keys allTerms)
(List.cycle [mempty])
)
)
printDebug "[buldNgramsTermsList: Flow Social List / end]" nt
let ngramsKeys = HashMap.keysSet allTerms
......
......@@ -72,7 +72,7 @@ groupWith :: GroupParams
-> NgramsTerm
-> NgramsTerm
groupWith GroupIdentity t = identity t
groupWith (GroupParams l _m _n _) t =
groupWith (GroupParams { unGroupParams_lang = l }) t =
NgramsTerm
$ Text.intercalate " "
$ map (stem l)
......@@ -86,7 +86,7 @@ groupWith (GroupParams l _m _n _) t =
$ unNgramsTerm t
-- | This lemmatization group done with CoreNLP algo (or others)
groupWith (GroupWithPosTag _ _ m) t =
groupWith (GroupWithPosTag { _gwl_map = m }) t =
case HashMap.lookup (unNgramsTerm t) m of
Nothing -> clean t
Just t' -> clean $ NgramsTerm t'
......
......@@ -82,11 +82,11 @@ makeLenses ''TermType
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
extractTerms (Unsupervised {..}) xs = mapM (terms (Unsupervised { _tt_model = Just m', .. })) xs
where
m' = case m of
m' = case _tt_model of
Just m''-> m''
Nothing -> newTries n (Text.intercalate " " xs)
Nothing -> newTries _tt_windowSize (Text.intercalate " " xs)
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
......@@ -96,15 +96,16 @@ withLang :: (Foldable t, Functor t, HasText h)
=> TermType Lang
-> t h
-> TermType Lang
withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
withLang (Unsupervised {..}) ns = Unsupervised { _tt_model = m', .. }
where
m' = case m of
m' = case _tt_model of
Nothing -> -- trace ("buildTries here" :: String)
Just $ buildTries n $ fmap toToken
$ uniText
$ Text.intercalate " . "
$ List.concat
$ map hasText ns
Just $ buildTries _tt_ngramsSize
$ fmap toToken
$ uniText
$ Text.intercalate " . "
$ List.concat
$ map hasText ns
just_m -> just_m
withLang l _ = l
......@@ -171,9 +172,9 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
terms (Unsupervised { .. }) txt = termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
where
m' = maybe (newTries n txt) identity m
m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment