Commit 96a23581 authored by mzheng's avatar mzheng

add grouping when creating a corpus from HAL

parent 9144bad9
Pipeline #6552 passed with stages
in 48 minutes and 58 seconds
......@@ -18,7 +18,7 @@ fi
# with the `sha256sum` result calculated on the `cabal.project` and
# `cabal.project.freeze`. This ensures the files stay deterministic so that CI
# cache can kick in.
expected_cabal_project_hash="9c487a789f77d9a96b4ac6a4b6268a075a72b8a391d987ba12194a46d96f6ee8"
expected_cabal_project_hash="500b0d7b6c0b95937096a27c9a21fcaeeb6c0933d6f0db5e2ead9e69fa25b63f"
expected_cabal_project_freeze_hash="50f3ccea242400c48bd9cec7286bd07c8223c87c043e09576dd5fef0949f982a"
......
......@@ -93,7 +93,7 @@ source-repository-package
source-repository-package
type: git
location: https://gitlab.iscpif.fr/gargantext/crawlers/hal.git
tag: 1dbd939257d33126e49d2679375553df1f2eebc5
tag: d54812d52c9d1f86d331a991b3a87c9a8b4379cf
source-repository-package
type: git
......
......@@ -229,4 +229,5 @@ hyperdataDocumentFromSearxResult l (SearxResult { _sr_content, _sr_engine, _sr_p
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ T.pack $ show l }
, _hd_language_iso2 = Just $ T.pack $ show l
, _hd_institutes_tree = Nothing}
......@@ -85,7 +85,8 @@ documentUpload nId doc = do
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ view du_language doc }
, _hd_language_iso2 = Just $ view du_language doc
, _hd_institutes_tree = Nothing }
let lang = EN
ncs <- view $ nlpServerGet lang
......
......@@ -156,6 +156,7 @@ hyperdataDocumentFromFrameWrite lang paragraphSize (node, contents) =
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ T.pack $ show lang }
, _hd_language_iso2 = Just $ T.pack $ show lang
, _hd_institutes_tree = Nothing }
) (text2titleParagraphs paragraphSize ctxts)
)
......@@ -119,7 +119,8 @@ toDoc l (Arxiv.Result { abstract
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (Text.pack . show) l }
, _hd_language_iso2 = Just $ (Text.pack . show) l
, _hd_institutes_tree = Nothing }
where
authors :: [Ax.Author] -> Maybe Text
authors [] = Nothing
......
......@@ -61,7 +61,8 @@ toDoc lang (EPO.HyperdataDocument { .. }) =
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ iso639ToText lang }
, _hd_language_iso2 = Just $ iso639ToText lang
, _hd_institutes_tree = Nothing }
where
authors_ = if null authors
......
......@@ -53,7 +53,7 @@ toDoc' la (HAL.Document { .. }) = do
, _hd_page = Nothing
, _hd_title = Just $ unwords _document_title
, _hd_authors = Just $ foldl' (\x y -> if x == "" then y else x <> ", " <> y) "" _document_authors_names
, _hd_institutes = Just $ foldl' (\x y -> if x == "" then y else x <> ", " <> y) "" $ _document_authors_affiliations <> map show _document_struct_id
, _hd_institutes = Just $ foldl' (\x y -> if x == "" then y else x <> ", " <> y) "" $ zipWith (\affialition structId -> affialition <> " | " <> structId) _document_authors_affiliations $ map show _document_struct_id
, _hd_source = Just $ maybe "Nothing" identity _document_source
, _hd_abstract = Just abstract
, _hd_publication_date = fmap show utctime
......@@ -63,4 +63,5 @@ toDoc' la (HAL.Document { .. }) = do
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ show la }
, _hd_language_iso2 = Just $ show la
, _hd_institutes_tree = Just _document_institutes_tree }
......@@ -94,4 +94,5 @@ isidoreToDoc lang (IsidoreDoc t a d u s as) = do
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just . Text.pack . show $ lang
, _hd_institutes_tree = Nothing
}
......@@ -50,7 +50,8 @@ toDoc (OA.Work { .. } ) =
, _hd_publication_hour = Nothing -- TODO
, _hd_publication_minute = Nothing -- TODO
, _hd_publication_second = Nothing -- TODO
, _hd_language_iso2 = language }
, _hd_language_iso2 = language
, _hd_institutes_tree = Nothing }
where
firstPage :: OA.Biblio -> Maybe Int
firstPage OA.Biblio { first_page } = (readMaybe . T.unpack) =<< first_page
......
......@@ -131,7 +131,8 @@ toDoc l (PubMedDoc.PubMed { pubmed_article = PubMedDoc.PubMedArticle t j as aus
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (Text.pack . show) l }
, _hd_language_iso2 = Just $ (Text.pack . show) l
, _hd_institutes_tree = Nothing }
where
authors :: [PubMedDoc.Author] -> Maybe Text
authors [] = Nothing
......
......@@ -267,7 +267,8 @@ toDoc ff d = do
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (DT.pack . show) lang }
, _hd_language_iso2 = Just $ (DT.pack . show) lang
, _hd_institutes_tree = Nothing }
-- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
pure hd
......
......@@ -95,6 +95,7 @@ publiToHyperdata y (Publi a s t txt) =
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ DT.pack $ show FR
, _hd_institutes_tree = Nothing
}
-------------------------------------------------------------
......
......@@ -56,6 +56,7 @@ gitlabIssue2hyperdataDocument issue = HyperdataDocument
, _hd_publication_minute = Just (todMin tod)
, _hd_publication_second = Just (round $ todSec tod)
, _hd_language_iso2 = Just $ (DT.pack . show) lang
, _hd_institutes_tree = Nothing
}
where lang = EN
date = _issue_created issue
......
......@@ -89,7 +89,8 @@ instance ToHyperdataDocument GrandDebatReference
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ Text.pack $ show FR }
, _hd_language_iso2 = Just $ Text.pack $ show FR
, _hd_institutes_tree = Nothing }
where
toAbstract = Text.intercalate " . " . (filter (/= "") . map toSentence)
toSentence (GrandDebatResponse _id _qtitle _qvalue r) = case r of
......
......@@ -135,6 +135,7 @@ bind2doc l [ link', date, langDoc, authors, _source, publisher, title, abstract
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = unbound l langDoc }
, _hd_language_iso2 = unbound l langDoc
, _hd_institutes_tree = Nothing }
bind2doc _ _ = undefined
......@@ -50,5 +50,6 @@ toDoc la (ISTEX.Document i t a ab d s) = do
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Just $ (T.pack . show) la
, _hd_institutes_tree = Nothing
}
......@@ -75,7 +75,8 @@ toDoc (TsvGargV3 did dt _ dpy dpm dpd dab dau) =
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing }
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
---------------------------------------------------------------
-- | Types Conversions
......@@ -537,7 +538,8 @@ tsvHal2doc (TsvHal { .. }) =
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing }
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
tsv2doc :: TsvDoc -> HyperdataDocument
......@@ -560,7 +562,8 @@ tsv2doc (TsvDoc { .. })
, _hd_publication_hour = Nothing
, _hd_publication_minute = Nothing
, _hd_publication_second = Nothing
, _hd_language_iso2 = Nothing }
, _hd_language_iso2 = Nothing
, _hd_institutes_tree = Nothing }
where
pubYear = fromMIntOrDec defaultYear tsv_publication_year
pubMonth = fromMaybe defaultMonth tsv_publication_month
......
......@@ -93,7 +93,8 @@ wikiPageToDocument m wr = do
, _hd_publication_hour = hour
, _hd_publication_minute = minute
, _hd_publication_second = sec
, _hd_language_iso2 = iso2 }
, _hd_language_iso2 = iso2
, _hd_institutes_tree = Nothing }
wikidataSelect :: Int -> IO [WikiResult]
......
......@@ -28,7 +28,7 @@ import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
import Gargantext.Core.NLP (HasNLPServer)
import Gargantext.Core.NodeStory.Types ( HasNodeStory )
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group ( toGroupedTree, setScoresWithMap )
import Gargantext.Core.Text.List.Group ( toGroupedTree, setScoresWithMap, toGroupedTreeInstitutes )
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social ( FlowSocialListWith, flowSocialList )
......@@ -38,7 +38,7 @@ import Gargantext.Core.Text.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main ( ListType(..) )
import Gargantext.Data.HashMap.Strict.Utils qualified as HashMap
import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser, getTreeInstitutesUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node ( MasterCorpusId, UserCorpusId, ContextId )
import Gargantext.Database.Prelude (DBCmd)
......@@ -77,18 +77,64 @@ buildNgramsLists :: ( HasNodeStory env err m
-> m (Map NgramsType [NgramsElement])
buildNgramsLists user uCid mCid mfslw gp = do
ngTerms <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize goodMapListSize)
instTerms <- buildNgramsInstList user uCid mfslw GroupIdentity (Institutes, MapListSize 300, MaxListSize 1000)
othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
[ (Authors , MapListSize 9, MaxListSize 1000)
, (Sources , MapListSize 9, MaxListSize 1000)
, (Institutes, MapListSize 9, MaxListSize 1000)
]
pure $ Map.unions $ [ngTerms] <> othersTerms
pure $ Map.unions $ [ngTerms] <> othersTerms <> [instTerms]
newtype MapListSize = MapListSize { unMapListSize :: Int }
newtype MaxListSize = MaxListSize { unMaxListSize :: Int }
-- | Build the ngrams list for the 'Institutes' ngrams type, grouping the
-- terms according to the institute tree stored in each document's
-- hyperdata (see 'getTreeInstitutesUser').
buildNgramsInstList :: ( HasNodeError err
                       , HasNLPServer env
                       , HasNodeStory env err m
                       , HasTreeError err
                       )
                    => User
                    -> UserCorpusId
                    -> Maybe FlowSocialListWith
                    -> GroupParams
                    -> (NgramsType, MapListSize, MaxListSize)
                    -> m (Map NgramsType [NgramsElement])
buildNgramsInstList user uCid mfslw _groupParams (nt, MapListSize mapListSize, MaxListSize maxListSize) = do
  termContexts :: HashMap NgramsTerm (Set ContextId) <- getContextsByNgramsUser uCid nt
  instTree :: HashMap Text [Text] <- getTreeInstitutesUser uCid nt
  -- PrivateFirst for first developments since Public NodeMode is not implemented yet
  socialScores :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList mfslw user nt
         ( FlowCont HashMap.empty
             $ HashMap.fromList
             $ List.zip (HashMap.keys termContexts) (repeat mempty)
         )
  let grouped = toGroupedTreeInstitutes {- groupParams -} socialScores termContexts instTree
      -- split off stop terms first, then the terms already marked as map terms
      (stops, rest)  = HashMap.partition ((== Just StopTerm) . viewListType)
                         $ view flc_scores grouped
      (maps, rest')  = HashMap.partition ((== Just MapTerm) . viewListType) rest
      -- remaining budget of the map list after the pre-marked map terms
      remaining      = mapListSize - List.length maps
      -- best-scored remaining terms get promoted to MapTerm, the rest
      -- (up to maxListSize) become candidates
      (promoted, candidates) =
        both HashMap.fromList
          $ List.splitAt remaining
          $ List.take maxListSize
          $ List.sortOn (Down . viewScore . snd)
          $ HashMap.toList rest'
      elems = toNgramsElement stops
           <> toNgramsElement maps
           <> toNgramsElement (setListType (Just MapTerm) promoted)
           <> toNgramsElement (setListType (Just CandidateTerm) candidates)
  pure $ Map.fromList [(nt, List.take maxListSize elems)]
buildNgramsOthersList :: ( HasNodeError err
, HasNLPServer env
, HasNodeStory env err m
......
......@@ -20,20 +20,29 @@ import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithScores ( groupWithScores' )
import Gargantext.Core.Text.List.Group.WithScores ( groupWithScores', groupWithScoresInstitutes' )
import Gargantext.Core.Text.List.Social.Prelude ( FlowListScores, FlowCont )
import Gargantext.Prelude
------------------------------------------------------------------------
toGroupedTree :: (Ord a, Monoid a, HasSize a)
=> FlowCont NgramsTerm FlowListScores
-> HashMap NgramsTerm a
-> FlowCont NgramsTerm (GroupedTreeScores a)
-> FlowCont NgramsTerm (GroupedTreeScores a) -- a = (Set ContextId)
toGroupedTree flc scores =
groupWithScores' flc scoring
where
scoring t = fromMaybe mempty $ HashMap.lookup t scores
-- | Like 'toGroupedTree', but additionally threads the institute tree so
-- that grouping can attach institute children to each parent term.
toGroupedTreeInstitutes :: (Ord a, Monoid a, HasSize a)
                        => FlowCont NgramsTerm FlowListScores
                        -> HashMap NgramsTerm a
                        -> HashMap Text [Text]
                        -> FlowCont NgramsTerm (GroupedTreeScores a) -- a = (Set ContextId)
toGroupedTreeInstitutes flc scores instTree =
  groupWithScoresInstitutes' flc lookupScore instTree
  where
    -- score of a term; terms absent from the score map count as mempty
    lookupScore term = fromMaybe mempty (HashMap.lookup term scores)
------------------------------------------------------------------------
setScoresWithMap :: (Ord a, Ord b, Monoid b) => HashMap NgramsTerm b
-> HashMap NgramsTerm (GroupedTreeScores a)
......
......@@ -41,6 +41,22 @@ groupWithScores' flc scores = FlowCont groups orphans
-- orphans should be filtered already then becomes empty
orphans = mempty
-- | Institute-aware variant of 'groupWithScores'': score every term,
-- build the grouped tree (injecting institute children) and drop the
-- zero-score entries.
groupWithScoresInstitutes' :: (Eq a, Ord a, Monoid a, HasSize a)
                           => FlowCont NgramsTerm FlowListScores
                           -> (NgramsTerm -> a)
                           -> HashMap Text [Text]
                           -> FlowCont NgramsTerm (GroupedTreeScores a)
groupWithScoresInstitutes' flc scoring instTree =
  -- orphans should be filtered already, hence the empty continuation
  FlowCont grouped mempty
  where
    -- parent/child relation is inherited from social lists
    grouped =
      HashMap.filter ((> 0) . viewScore)
        ( toGroupedTreeInstitutes' instTree
            ( toMapMaybeParent scoring
                (view flc_scores flc <> view flc_cont flc)
            )
        )
------------------------------------------------------------------------
toMapMaybeParent :: (Eq a, Ord a, Monoid a)
......@@ -72,6 +88,14 @@ toGroupedTree' m = case HashMap.lookup Nothing m of
Nothing -> mempty
Just m' -> toGroupedTree'' m m'
-- | Entry point of the institute-aware tree construction: start from the
-- root terms (those grouped under no parent), or return empty when the
-- map has no such roots.
toGroupedTreeInstitutes' :: Eq a
                         => HashMap Text [Text]
                         -> HashMap (Maybe Parent) (HashMap NgramsTerm (GroupedTreeScores a))
                         -> HashMap Parent (GroupedTreeScores a)
toGroupedTreeInstitutes' instTree m =
  maybe mempty
        (\roots -> toGroupedTreeInstitutes'' m roots instTree)
        (HashMap.lookup Nothing m)
filterGroupedTree :: (GroupedTreeScores a -> Bool)
-> HashMap Parent (GroupedTreeScores a)
-> HashMap Parent (GroupedTreeScores a)
......@@ -93,3 +117,37 @@ toGroupedTree'' m notEmpty
)
v
-- | Like 'toGroupedTree''', but additionally attaches, to every parent
-- term, the children listed for it in the institute tree.  Synthesized
-- institute children inherit the parent's score and list type and carry
-- no children of their own.
--
-- Fixes over the previous version: the institute children are built with
-- a single 'HashMap.fromList' instead of a lazy 'foldl' of repeated
-- singleton unions (which built thunks and re-walked the accumulator),
-- and the duplicated union with the social-list children, previously
-- repeated in both case branches, is hoisted out.
toGroupedTreeInstitutes'' :: Eq a => HashMap (Maybe Parent) (HashMap NgramsTerm (GroupedTreeScores a))
                          -> HashMap NgramsTerm (GroupedTreeScores a)
                          -> HashMap Text [Text]
                          -> HashMap Parent (GroupedTreeScores a)
toGroupedTreeInstitutes'' m notEmpty institutesTree
  | notEmpty == mempty = mempty
  | otherwise          = HashMap.mapWithKey (addGroup institutesTree m) notEmpty
  where
    -- Attach both the institute children and the regular (social-list)
    -- children of term @k@, then recurse on the children via 'toGroupedTree'''.
    addGroup :: (Eq score)
             => HashMap Text [Text]
             -> HashMap (Maybe Parent) (HashMap NgramsTerm (GroupedTreeScores score))
             -> Parent
             -> GroupedTreeScores score
             -> GroupedTreeScores score
    addGroup instTree m' k v =
      over gts'_children
           ( toGroupedTree'' m'
           . HashMap.union instituteChildren  -- union with mempty is identity
           . HashMap.union directChildren
           )
           v
      where
        -- children recorded in the social-list hierarchy under this parent
        directChildren = fromMaybe mempty $ HashMap.lookup (Just k) m'
        -- leaf nodes synthesized from the institute tree, inheriting the
        -- parent's score and list type
        instituteChildren =
          case HashMap.lookup (unNgramsTerm k) instTree of
            Nothing       -> HashMap.empty
            Just children ->
              HashMap.fromList
                [ ( NgramsTerm child
                  , GroupedTreeScores { _gts'_score    = _gts'_score v
                                      , _gts'_listType = _gts'_listType v
                                      , _gts'_children = HashMap.empty
                                      }
                  )
                | child <- children
                ]
\ No newline at end of file
......@@ -30,6 +30,7 @@ import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Core ( HasDBid(toDBid) )
import Gargantext.Core.Text.Ngrams (NgramsType(..))
import Gargantext.Data.HashMap.Strict.Utils as HM ( unionsWith )
import Gargantext.Database.Admin.Types.Hyperdata.Document
import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId (..), MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
import Gargantext.Database.Prelude (DBCmd, runPGSQuery)
import Gargantext.Database.Schema.Ngrams () -- toDBid instance
......@@ -91,6 +92,34 @@ getContextsByNgramsUser cId nt =
GROUP BY cng.context_id, ng.terms
|]
-- | Gather, over every document of the corpus carrying the given ngrams
-- type, the institute tree stored in the document hyperdata, merged into
-- a single parent -> children map.
getTreeInstitutesUser :: HasDBid NodeType
                      => CorpusId
                      -> NgramsType
                      -> DBCmd err (HashMap Text [Text])
getTreeInstitutesUser cId nt = do
  docs <- selectHyperDataByContextUser cId nt
  pure $ HM.unionsWith (++) [ docTree hd | (_, hd) <- docs ]
  where
    -- one document's institute tree, each child wrapped in a singleton
    -- list so that 'unionsWith (++)' concatenates children across documents
    docTree hd = HM.fromList
               $ map (\(parent, child) -> (parent, [child]))
               $ Map.toList
               $ fromMaybe Map.empty (_hd_institutes_tree hd)
-- | All (context, hyperdata) pairs of the corpus documents carrying the
-- given ngrams type; trashed documents are excluded by the query.
selectHyperDataByContextUser :: HasDBid NodeType
                             => CorpusId
                             -> NgramsType
                             -> DBCmd err [(ContextId, HyperdataDocument)]
selectHyperDataByContextUser corpusId ngramsType =
  runPGSQuery queryHyperDataByContextUser (corpusId, toDBid ngramsType)
-- | Selects the hyperdata of every document (context) of the corpus that
-- carries at least one ngram of the requested type.  Positional
-- parameters: CorpusId, NgramsTypeId.  The GROUP BY produces one row per
-- document: with no aggregate computed it acts as a DISTINCT over the
-- (context_id, hyperdata) pairs.
queryHyperDataByContextUser :: DPS.Query
queryHyperDataByContextUser = [sql|
  SELECT cng.context_id, c.hyperdata FROM context_node_ngrams cng
    JOIN ngrams ng ON cng.ngrams_id = ng.id
    JOIN nodes_contexts nc ON nc.context_id = cng.context_id
    JOIN contexts c ON nc.context_id = c.id
    WHERE nc.node_id = ?      -- CorpusId
      AND cng.ngrams_type = ? -- NgramsTypeId
      AND nc.category > 0     -- is not in Trash
    GROUP BY cng.context_id, c.hyperdata
  |]
------------------------------------------------------------------------
getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
......
......@@ -36,6 +36,7 @@ data HyperdataDocument = HyperdataDocument { _hd_bdd :: !(Maybe T
, _hd_publication_minute :: !(Maybe Int)
, _hd_publication_second :: !(Maybe Int)
, _hd_language_iso2 :: !(Maybe Text)
, _hd_institutes_tree :: !(Maybe (Map Text Text))
}
deriving (Show, Generic)
......@@ -53,7 +54,7 @@ defaultHyperdataDocument = case decode docExample of
Nothing Nothing Nothing Nothing
Nothing Nothing Nothing Nothing
Nothing Nothing Nothing Nothing
Nothing Nothing Nothing
Nothing Nothing Nothing Nothing
where
docExample :: ByteString
......@@ -120,7 +121,7 @@ arbitraryHyperdataDocuments =
toHyperdataDocument' (t1,t2) =
HyperdataDocument Nothing Nothing Nothing Nothing (Just t1)
Nothing Nothing (Just t2) Nothing Nothing Nothing Nothing Nothing
Nothing Nothing Nothing Nothing
Nothing Nothing Nothing Nothing Nothing
......
......@@ -134,7 +134,7 @@
git: "https://gitlab.iscpif.fr/gargantext/crawlers/epo-proxy-api.git"
subdirs:
- .
- commit: 1dbd939257d33126e49d2679375553df1f2eebc5
- commit: d54812d52c9d1f86d331a991b3a87c9a8b4379cf
git: "https://gitlab.iscpif.fr/gargantext/crawlers/hal.git"
subdirs:
- .
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment