Commit b6f257b3 authored by Alfredo Di Napoli's avatar Alfredo Di Napoli

Merge branch 'adinapoli/issue-342' into 'dev'

Forest of trees: restore hierarchical grouping of terms

Closes #313

See merge request !424
parents 3755ded0 186214c6
Pipeline #7785 passed with stages
in 98 minutes and 52 seconds
......@@ -570,6 +570,7 @@ library
, json-stream ^>= 0.4.2.4
, lens >= 5.2.2 && < 5.3
, lens-aeson < 1.3
, list-zipper
, massiv < 1.1
, matrix ^>= 0.3.6.1
, mime-mail >= 0.5.1
......
This diff is collapsed.
......@@ -10,10 +10,17 @@ Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.API.Ngrams.NgramsTree
where
( -- * Types
NgramsForest(..)
, NgramsTree
, GeneralisedNgramsTree(..)
-- * Construction
, toNgramsTree
, toNgramsForest
) where
import Data.Aeson
import Data.HashMap.Strict (HashMap)
import Data.HashMap.Strict qualified as HashMap
import Data.List qualified as List
......@@ -23,41 +30,80 @@ import Data.Tree ( Tree(Node), unfoldForest )
import Gargantext.API.Ngrams.Types
import Gargantext.Core.Types.Main ( ListType(..) )
import Gargantext.Database.Admin.Types.Node ( NodeId )
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Core.Utils.Prefix (unPrefixSwagger)
import Gargantext.Prelude
import Test.QuickCheck ( Arbitrary(arbitrary) )
type Children = Text
type Root = Text
-- | Ngrams forms a forest, i.e. a set of trees, each tree represents a strong grouping
-- between terms. Each tree has a root and some children, of arbitrary depth. We use
-- this data structure internally to represent ngrams tables in a principled way, and later
-- we \"render\" them back into an 'NgramsTable' and/or a set of ngrams elements, where
-- each 'NgramElement' is a standalone tree.
--
-- Properties:
--
-- * Aciclic: each tree is a DAG, and therefore there must be no cycles within the tree,
-- and no cycles between trees in the forest.
--
-- /NOTE/: An 'NgramsForest' and a 'GeneralisedNgramsTree' are essentially isomorphic to the \"Tree\"
-- and \"Forest\" types from the \"containers\" library, but our version allows storing both a label and
-- a value for each node.
newtype NgramsForest =
NgramsForest { getNgramsForest :: [NgramsTree] }
deriving (Show, Eq, Ord)
type NgramsTree = GeneralisedNgramsTree Text Int
data NgramsTree = NgramsTree { mt_label :: Text
, mt_value :: Double
, mt_children :: [NgramsTree]
}
deriving (Generic, Show, Eq)
-- | Models a general ngram tree polymorphic over a label 'l' and a measure 'm'.
data GeneralisedNgramsTree l m =
GeneralisedNgramsTree { mt_label :: l
, mt_value :: m
, mt_children :: [GeneralisedNgramsTree l m]
}
deriving (Generic, Show, Eq, Ord)
toNgramsTree :: Tree (NgramsTerm,Double) -> NgramsTree
toNgramsTree (Node (NgramsTerm l,v) xs) = NgramsTree l v (map toNgramsTree xs)
instance (ToJSON l, ToJSON m) => ToJSON (GeneralisedNgramsTree l m) where
toJSON (GeneralisedNgramsTree l m children) =
object [ "label" .= toJSON l
, "value" .= toJSON m
, "children" .= toJSON children
]
deriveJSON (unPrefix "mt_") ''NgramsTree
instance (FromJSON l, FromJSON m) => FromJSON (GeneralisedNgramsTree l m) where
parseJSON = withObject "NgramsTree" $ \o -> do
mt_label <- o .: "label"
mt_value <- o .: "value"
mt_children <- o .: "children"
pure $ GeneralisedNgramsTree{..}
instance ToSchema NgramsTree where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "mt_")
instance Arbitrary NgramsTree
where
arbitrary = NgramsTree <$> arbitrary <*> arbitrary <*> arbitrary
arbitrary = GeneralisedNgramsTree <$> arbitrary <*> arbitrary <*> arbitrary
--
-- Constructing trees and forests
--
-- | Given a 'Tree' from the \"containers\" library that has an 'NgramsTerm' and a score at the leaves,
-- converts it into a gargantext 'NgramsTree' tree.
toNgramsTree :: Tree (NgramsTerm,Int) -> NgramsTree
toNgramsTree (Node (NgramsTerm l,v) xs) = GeneralisedNgramsTree l v (map toNgramsTree xs)
toTree :: ListType
-> HashMap NgramsTerm (Set NodeId)
-> HashMap NgramsTerm NgramsRepoElement
-> [NgramsTree]
toTree lt vs m = map toNgramsTree $ unfoldForest buildNode roots
-- | Given a 'ListType', which informs which category of terms we want to focus on (stop, map, candidate)
-- and two hashmaps mapping an 'NgramsTerm' to their values, builds an 'NgramsForest'.
toNgramsForest :: ListType
-> HashMap NgramsTerm (Set NodeId)
-> HashMap NgramsTerm NgramsRepoElement
-> NgramsForest
toNgramsForest lt vs m = NgramsForest $ map toNgramsTree $ unfoldForest buildNode roots
where
buildNode r = maybe ((r, value r),[])
(\x -> ((r, value r), mSetToList $ _nre_children x))
(HashMap.lookup r m)
value l = maybe 0 (fromIntegral . Set.size) $ HashMap.lookup l vs
value l = maybe 0 Set.size $ HashMap.lookup l vs
rootsCandidates :: [NgramsTerm]
rootsCandidates = catMaybes
......
This diff is collapsed.
......@@ -17,7 +17,7 @@ import Data.List qualified as List
import Data.Map.Strict (toList)
import Data.Set qualified as Set
import Data.Vector qualified as V
import Gargantext.API.Ngrams.NgramsTree ( toTree, NgramsTree )
import Gargantext.API.Ngrams.NgramsTree ( toNgramsForest, NgramsTree, getNgramsForest )
import Gargantext.API.Ngrams.Tools ( filterListWithRoot, getListNgrams, getRepo, mapTermListRoot )
import Gargantext.API.Ngrams.Types ( NgramsTerm(NgramsTerm) )
import Gargantext.Core.NodeStory.Types ( NodeStoryEnv )
......@@ -91,4 +91,4 @@ treeData env cId nt lt = do
cs' <- HashMap.map (Set.map contextId2NodeId) <$> getContextsByNgramsOnlyUser cId (ls' <> ls) nt terms
m <- getListNgrams env ls nt
pure $ V.fromList $ toTree lt cs' m
pure $ V.fromList $ getNgramsForest $ toNgramsForest lt cs' m
......@@ -18,6 +18,7 @@
- "data-default-0.8.0.0"
- "data-default-class-0.2.0.0"
- "deferred-folds-0.9.18.7"
- "deriving-compat-0.6.7"
- "entropy-0.4.1.11"
- "file-embed-lzma-0.1"
- "foldl-1.4.18"
......@@ -38,6 +39,7 @@
- "jose-0.10.0.1"
- "language-c-0.10.0"
- "linear-1.23"
- "list-zipper-0.0.12"
- "massiv-1.0.4.1"
- "megaparsec-9.7.0"
- "microlens-th-0.4.3.16"
......@@ -367,7 +369,7 @@ flags:
gargantext:
"enable-benchmarks": false
"no-phylo-debug-logs": true
"test-crypto": false
"test-crypto": true
graphviz:
"test-parsing": false
hashable:
......@@ -540,8 +542,6 @@ flags:
transformers: true
tasty:
unix: true
"tasty-golden":
"build-example": false
"text-format":
developer: false
"text-metrics":
......
This diff is collapsed.
......@@ -12,6 +12,7 @@ Portability : POSIX
{-# OPTIONS_GHC -fconstraint-solver-iterations=10 #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Test.Instances
where
......@@ -382,15 +383,133 @@ instance Arbitrary DET.WSRequest where
, pure DET.WSDeauthorize ]
arbitraryNgramsTerm :: Gen Ngrams.NgramsTerm
arbitraryNgramsTerm = elements
[ "time"
, "year"
, "people"
, "way"
, "day"
, "man"
, "thing"
, "woman"
, "life"
, "child"
, "world"
, "school"
, "state"
, "family"
, "student"
, "group"
, "country"
, "problem"
, "hand"
, "part"
, "place"
, "case"
, "week"
, "company"
, "system"
, "program"
, "question"
, "work"
, "government"
, "number"
, "night"
, "point"
, "home"
, "water"
, "room"
, "mother"
, "area"
, "money"
, "story"
, "fact"
, "month"
, "lot"
, "right"
, "study"
, "book"
, "eye"
, "job"
, "word"
, "business"
, "issue"
, "side"
, "kind"
, "head"
, "house"
, "service"
, "friend"
, "father"
, "power"
, "hour"
, "game"
, "line"
, "end"
, "member"
, "law"
, "car"
, "city"
, "community"
, "name"
, "president"
, "team"
, "minute"
, "idea"
, "kid"
, "body"
, "information"
, "back"
, "parent"
, "face"
, "others"
, "level"
, "office"
, "door"
, "health"
, "person"
, "art"
, "war"
, "history"
, "party"
, "result"
, "change"
, "morning"
, "reason"
, "research"
, "girl"
, "guy"
, "moment"
, "air"
, "teacher"
, "force"
, "education"
]
-- Ngrams
instance Arbitrary a => Arbitrary (Ngrams.MSet a)
-- We cannot pick some completely arbitrary values for the ngrams terms,
-- see the rationale in the instance for 'NgramsElement'.
instance Arbitrary Ngrams.NgramsTerm where
arbitrary = Ngrams.NgramsTerm <$>
-- we take into accoutn the fact, that tojsonkey strips the text
(arbitrary `suchThat` (\t -> t == T.strip t))
arbitrary = arbitraryNgramsTerm
instance Arbitrary Ngrams.TabType where arbitrary = arbitraryBoundedEnum
instance Arbitrary Ngrams.NgramsElement where
arbitrary = elements [Ngrams.newNgramsElement Nothing "sport"]
-- We cannot pick some completely arbitrary values for the ngrams elements
-- because we still want to simulate potential hierarchies, i.e. forests of ngrams.
-- so we sample the ngrams terms from a selection, and we restrict the number of max
-- children for each 'NgramsElement' to the size parameter to not have very large trees.
arbitrary = do
_ne_ngrams <- arbitrary
_ne_size <- arbitrary
_ne_list <- arbitrary
_ne_occurrences <- arbitrary
_ne_root <- arbitrary `suchThat` (maybe True (\x -> x /= _ne_ngrams)) -- can't be root of itself
_ne_parent <- arbitrary `suchThat` (maybe True (\x -> x /= _ne_ngrams)) -- can't be parent of itself
_ne_children <- Ngrams.mSetFromList <$> (sized (\n -> vectorOf n arbitrary `suchThat` (\x -> _ne_ngrams `notElem` x))) -- can't be cyclic
pure Ngrams.NgramsElement{..}
instance Arbitrary Ngrams.NgramsTable where
arbitrary = pure ngramsMockTable
instance Arbitrary Ngrams.OrderBy where arbitrary = arbitraryBoundedEnum
......
{-# LANGUAGE TypeApplications #-}
module Test.Ngrams.Query (tests) where
module Test.Ngrams.Query (tests, mkMapTerm) where
import Control.Monad
import Data.Coerce
......
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ViewPatterns #-}
module Test.Offline.Ngrams (tests) where
import Prelude
import Control.Lens
import Data.Map.Strict qualified as Map
import Data.Text qualified as T
import Gargantext.API.Ngrams (filterNgramsNodes, buildForest, destroyForest, pruneForest)
import Gargantext.API.Ngrams.Types
import Gargantext.API.Ngrams.Types qualified as NT
import Gargantext.Core
import Gargantext.Core.Text.Terms.Mono (isSep)
import Gargantext.Core.Text.Terms.WithList
import Gargantext.Core.Types
import Gargantext.Database.Action.Flow.Utils (docNgrams)
import Gargantext.Database.Schema.Context
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Schema.Context
import Test.HUnit
import Test.Hspec
import Test.Instances ()
import Test.Ngrams.Query (mkMapTerm)
import Test.QuickCheck
import Test.Hspec
import Control.Lens
import qualified Test.QuickCheck as QC
import Gargantext.Core.Text.Terms.Mono (isSep)
import Test.QuickCheck qualified as QC
import Data.Tree
import Text.RawString.QQ (r)
import Data.Char (isSpace)
import Data.Map.Strict (Map)
import Test.Hspec.QuickCheck (prop)
genScientificText :: Gen T.Text
......@@ -89,6 +101,31 @@ tests = describe "Ngrams" $ do
it "return results for non-empty input terms" $ property testBuildPatternsNonEmpty
describe "docNgrams" $ do
it "always matches if the input text contains any of the terms" $ property testDocNgramsOKMatch
describe "ngram forests" $ do
it "building a simple tree works" testBuildNgramsTree_01
it "building a complex tree works" testBuildNgramsTree_02
it "building a complex deep tree works" testBuildNgramsTree_03
it "pruning a simple tree works" testPruningNgramsForest_01
it "pruning a complex tree works" testPruningNgramsForest_02
prop "destroyForest . buildForest === id" buildDestroyForestRoundtrips
describe "hierarchical grouping" $ do
it "filterNgramsNodes with empty query is identity" testFilterNgramsNodesEmptyQuery
hierarchicalTableMap :: Map NgramsTerm NgramsElement
hierarchicalTableMap = Map.fromList [
("vehicle", mkMapTerm "vehicle" & ne_children .~ mSetFromList ["car"])
, ("car", mkMapTerm "car" & ne_root .~ Just "vehicle"
& ne_parent .~ Just "vehicle"
& ne_children .~ mSetFromList ["ford"])
, ("ford", mkMapTerm "ford" & ne_root .~ Just "vehicle"
& ne_parent .~ Just "car")
]
testFilterNgramsNodesEmptyQuery :: Assertion
testFilterNgramsNodesEmptyQuery = do
let input = hierarchicalTableMap
let actual = filterNgramsNodes (Just MapTerm) Nothing Nothing (const True) input
actual @?= input
testDocNgramsOKMatch :: Lang -> DocumentWithMatches -> Property
testDocNgramsOKMatch lang (DocumentWithMatches ts doc) =
......@@ -103,3 +140,166 @@ testBuildPatternsNonEmpty :: Lang -> NonEmptyList NgramsTermNonEmpty -> Property
testBuildPatternsNonEmpty lang ts =
let ts' = map (NT.NgramsTerm . unNgramsTermNonEmpty) $ getNonEmpty ts
in counterexample "buildPatterns returned no results" $ length (buildPatternsWith lang ts') > 0
newtype ASCIIForest = ASCIIForest String
deriving Eq
instance Show ASCIIForest where
show (ASCIIForest x) = x
compareForestVisually :: Forest NgramsElement -> String -> Property
compareForestVisually f expected =
let actual = init $ drawForest (map (fmap renderEl) f)
outermostIndentation = T.length . T.takeWhile isSpace . T.dropWhile (=='\n') . T.pack $ expected
in ASCIIForest actual === ASCIIForest (sanitiseDrawing outermostIndentation expected)
where
renderEl :: NgramsElement -> String
renderEl = T.unpack . unNgramsTerm . _ne_ngrams
toTextPaths :: String -> [T.Text]
toTextPaths = T.splitOn "\n" . T.strip . T.pack
sanitiseDrawing :: Int -> String -> String
sanitiseDrawing outermostIndentation =
let dropLayout t = case T.uncons t of
Just (' ', _) -> T.drop outermostIndentation t
_ -> t -- leave it be
in T.unpack . T.unlines . map dropLayout . toTextPaths
testBuildNgramsTree_01 :: Property
testBuildNgramsTree_01 =
let t1 = Map.fromList [ ( "foo", mkMapTerm "foo" & ne_children .~ mSetFromList ["bar"])
, ( "bar", mkMapTerm "bar" & ne_parent .~ Just "foo")
]
in (buildForest t1) `compareForestVisually` [r|
bar
foo
|
`- bar
|]
testBuildNgramsTree_02 :: Property
testBuildNgramsTree_02 =
buildForest hierarchicalTableMap `compareForestVisually` [r|
car
|
`- ford
ford
vehicle
|
`- car
|
`- ford
|]
testBuildNgramsTree_03 :: Property
testBuildNgramsTree_03 =
let input = Map.fromList [
("animalia", mkMapTerm "animalia" & ne_children .~ mSetFromList ["chordata"])
, ("chordata", mkMapTerm "chordata" & ne_root .~ Just "animalia"
& ne_parent .~ Just "animalia"
& ne_children .~ mSetFromList ["mammalia"])
, ("mammalia", mkMapTerm "mammalia" & ne_root .~ Just "animalia"
& ne_parent .~ Just "chordata"
& ne_children .~ mSetFromList ["carnivora", "primates"]
)
, ("carnivora", mkMapTerm "carnivora" & ne_root .~ Just "animalia"
& ne_parent .~ Just "mammalia"
& ne_children .~ mSetFromList ["felidae"]
)
, ("felidae", mkMapTerm "felidae" & ne_root .~ Just "animalia"
& ne_parent .~ Just "carnivora"
& ne_children .~ mSetFromList ["panthera"]
)
, ("panthera", mkMapTerm "panthera" & ne_root .~ Just "animalia"
& ne_parent .~ Just "felidae"
& ne_children .~ mSetFromList ["panthera leo", "panthera tigris"]
)
, ("panthera leo", mkMapTerm "panthera leo" & ne_root .~ Just "animalia"
& ne_parent .~ Just "pathera"
)
, ("panthera tigris", mkMapTerm "panthera tigris" & ne_root .~ Just "animalia"
& ne_parent .~ Just "panthera"
)
, ("panthera tigris", mkMapTerm "panthera tigris" & ne_root .~ Just "animalia"
& ne_parent .~ Just "panthera"
)
, ("primates", mkMapTerm "primates" & ne_root .~ Just "animalia"
& ne_parent .~ Just "mammalia"
& ne_children .~ mSetFromList ["hominidae"]
)
, ("hominidae", mkMapTerm "hominidae" & ne_root .~ Just "animalia"
& ne_parent .~ Just "primates"
& ne_children .~ mSetFromList ["homo"]
)
, ("homo", mkMapTerm "homo" & ne_root .~ Just "animalia"
& ne_parent .~ Just "hominidae"
& ne_children .~ mSetFromList ["homo sapiens"]
)
, ("homo sapies", mkMapTerm "homo sapiens" & ne_root .~ Just "animalia"
& ne_parent .~ Just "homo"
)
]
in pruneForest (buildForest input) `compareForestVisually` [r|
animalia
|
`- chordata
|
`- mammalia
|
+- carnivora
| |
| `- felidae
| |
| `- panthera
| |
| +- panthera leo
| |
| `- panthera tigris
|
`- primates
|
`- hominidae
|
`- homo
|]
newtype TableMapLockStep = TableMapLockStep { getTableMap :: Map NgramsTerm NgramsElement }
deriving (Show, Eq)
instance Arbitrary TableMapLockStep where
arbitrary = do
pairs <- map (\(k,v) -> (k, v & ne_ngrams .~ k)) <$> arbitrary
pure $ TableMapLockStep (Map.fromList pairs)
-- /PRECONDITION/: The '_ne_ngrams' field always matches the 'NgramsTerm', key of the map.
buildDestroyForestRoundtrips :: TableMapLockStep -> Property
buildDestroyForestRoundtrips (TableMapLockStep mp) =
(destroyForest . buildForest $ mp) === mp
testPruningNgramsForest_01 :: Property
testPruningNgramsForest_01 =
let t1 = Map.fromList [ ( "foo", mkMapTerm "foo" & ne_children .~ mSetFromList ["bar"])
, ( "bar", mkMapTerm "bar" & ne_parent .~ Just "foo")
]
in (pruneForest $ buildForest t1) `compareForestVisually` [r|
foo
|
`- bar
|]
testPruningNgramsForest_02 :: Property
testPruningNgramsForest_02 =
(pruneForest $ buildForest hierarchicalTableMap) `compareForestVisually` [r|
vehicle
|
`- car
|
`- ford
|]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment