Commit dc1e008d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Learn function to export models

parent 61e8d712
...@@ -71,6 +71,7 @@ import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat) ...@@ -71,6 +71,7 @@ import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
import qualified Gargantext.Text.Corpus.API.Isidore as Isidore import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText) import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt) import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Prelude.Utils hiding (hash)
import System.FilePath (FilePath) import System.FilePath (FilePath)
import qualified Data.List as List import qualified Data.List as List
import qualified Data.Map as Map import qualified Data.Map as Map
...@@ -124,7 +125,7 @@ flowCorpusDebat :: FlowCmdM env err m ...@@ -124,7 +125,7 @@ flowCorpusDebat :: FlowCmdM env err m
flowCorpusDebat u n l fp = do flowCorpusDebat u n l fp = do
docs <- liftIO ( splitEvery 500 docs <- liftIO ( splitEvery 500
<$> take l <$> take l
<$> GD.readFile fp <$> readFile' fp
:: IO [[GD.GrandDebatReference ]] :: IO [[GD.GrandDebatReference ]]
) )
flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs) flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
......
...@@ -13,6 +13,8 @@ Portability : POSIX ...@@ -13,6 +13,8 @@ Portability : POSIX
{-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE RankNTypes #-} {-# LANGUAGE RankNTypes #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE MonoLocalBinds #-}
module Gargantext.Database.Learn where module Gargantext.Database.Learn where
...@@ -27,42 +29,82 @@ import qualified Data.List as List ...@@ -27,42 +29,82 @@ import qualified Data.List as List
import qualified Data.Text as Text import qualified Data.Text as Text
import Gargantext.Database.Schema.NodeNode (nodesToFavorite) import Gargantext.Database.Schema.NodeNode (nodesToFavorite)
import Gargantext.API.Node (delDocs, Documents(..)) import Gargantext.API.Node (delDocs, Documents(..))
import Gargantext.Database.Utils (Cmd) --import Gargantext.Database.Utils (Cmd)
import Gargantext.Database.Schema.Node (HasNodeError) --import Gargantext.Database.Schema.Node (HasNodeError)
import Gargantext.API
import Gargantext.API.Settings
import Gargantext.Database.Flow (FlowCmdM)
text :: FacetDoc -> (NodeId, Text) data FavOrTrash = IsFav | IsTrash
text (FacetDoc nId _ _ h _ _) = (nId, title <> "" <> Text.take 100 abstr)
where
title = maybe "" identity (_hyperdataDocument_title h)
abstr = maybe "" identity (_hyperdataDocument_abstract h)
--moreLike docs_fav docs_trash docs_test = do
data FavTrash = IsFav | IsTrash
deriving (Eq) deriving (Eq)
moreLike :: HasNodeError err => FavTrash -> CorpusId -> Cmd err [(NodeId, Maybe Bool)]
--moreLike :: FlowCmdM env error m => FavOrTrash -> CorpusId -> m (Events Bool, [FacetDoc])
moreLike :: FlowCmdM DevEnv GargError m => FavOrTrash -> CorpusId -> m [FacetDoc]
moreLike ft cId = do moreLike ft cId = do
let b = if (==) ft IsFav then True else False priors <- getPriors ft cId
moreLikeWith priors ft cId
---------------------------------------------------------------------------
getPriors :: FlowCmdM DevEnv GargError m => FavOrTrash -> CorpusId -> m (Events Bool)
getPriors ft cId = do
docs_trash <- runViewDocuments cId True Nothing Nothing Nothing
docs_trash <- map text <$> runViewDocuments cId True Nothing Nothing Nothing docs_fav <- filter (\(FacetDoc _ _ _ _ f _) -> f == True)
docs_fav <- map text <$> filter (\(FacetDoc _ _ _ _ f _) -> f == True) <$> runViewDocuments cId False Nothing Nothing Nothing <$> runViewDocuments cId False Nothing Nothing Nothing
docs_test <- map text <$> filter (\(FacetDoc _ _ _ _ f _) -> f == False) <$> runViewDocuments cId False Nothing Nothing Nothing
let priors = priorEventsWith snd b ( List.zip (repeat False) docs_fav
<> List.zip (repeat True ) docs_trash let priors = priorEventsWith text (fav2bool ft) ( List.zip (repeat False) docs_fav
) <> List.zip (repeat True ) docs_trash
)
pure priors
moreLikeWith :: FlowCmdM DevEnv GargError m => Events Bool -> FavOrTrash -> CorpusId -> m [FacetDoc]
moreLikeWith priors ft cId = do
pure $ filter ((==) (Just $ not b) . snd) $ map (\x -> (fst x, detectDefaultWithPriors snd priors x)) docs_test docs_test <- filter (\(FacetDoc _ _ _ _ f _) -> f == False)
<$> runViewDocuments cId False Nothing Nothing Nothing
learnModify :: HasNodeError err => FavTrash -> CorpusId -> [NodeId] -> Cmd err [Int] let results = map fst
learnModify favTrash cId ns = case favTrash of $ filter ((==) (Just $ not $ fav2bool ft) . snd)
$ map (\f -> (f, detectDefaultWithPriors text priors f)) docs_test
pure results
---------------------------------------------------------------------------
fav2bool :: FavOrTrash -> Bool
fav2bool ft = if (==) ft IsFav then True else False
text :: FacetDoc -> Text
text (FacetDoc _ _ _ h _ _) = title <> "" <> Text.take 100 abstr
where
title = maybe "" identity (_hyperdataDocument_title h)
abstr = maybe "" identity (_hyperdataDocument_abstract h)
---------------------------------------------------------------------------
apply :: (FlowCmdM DevEnv GargError m) => FavOrTrash -> CorpusId -> [NodeId] -> m [Int]
apply favTrash cId ns = case favTrash of
IsFav -> nodesToFavorite $ map (\n -> (cId, n, True)) ns IsFav -> nodesToFavorite $ map (\n -> (cId, n, True)) ns
IsTrash -> delDocs cId (Documents ns) IsTrash -> delDocs cId (Documents ns)
learnAndApply :: HasNodeError err => FavTrash -> CorpusId -> Cmd err [Int]
learnAndApply ft cId = do
ids <- map fst <$> moreLike ft cId
learnModify ft cId ids
moreLikeAndApply :: FlowCmdM DevEnv GargError m => FavOrTrash -> CorpusId -> m [Int]
moreLikeAndApply ft cId = do
priors <- getPriors ft cId
moreLikeWithAndApply priors ft cId
moreLikeWithAndApply :: FlowCmdM DevEnv GargError m => Events Bool -> FavOrTrash -> CorpusId -> m [Int]
moreLikeWithAndApply priors ft cId = do
ids <- map facetDoc_id <$> moreLikeWith priors ft cId
apply ft cId ids
...@@ -304,3 +304,6 @@ tail' = listSafeN "tail" tailMay ...@@ -304,3 +304,6 @@ tail' = listSafeN "tail" tailMay
init' :: Text -> [a] -> [a] init' :: Text -> [a] -> [a]
init' = listSafeN "init" initMay init' = listSafeN "init" initMay
------------------------------------------------------------------------
...@@ -54,9 +54,9 @@ class ReadFile a where ...@@ -54,9 +54,9 @@ class ReadFile a where
readFile' :: FilePath -> IO a readFile' :: FilePath -> IO a
saveFile :: (MonadReader env m, MonadIO m, HasSettings env, SaveFile a) writeFile :: (MonadReader env m, MonadIO m, HasSettings env, SaveFile a)
=> a -> m FilePath => a -> m FilePath
saveFile a = do writeFile a = do
dataPath <- view (settings . fileFolder) <$> ask dataPath <- view (settings . fileFolder) <$> ask
(fp,fn) <- liftIO $ (toPath 3) . hash . Text.pack . show <$> newStdGen (fp,fn) <- liftIO $ (toPath 3) . hash . Text.pack . show <$> newStdGen
......
...@@ -19,18 +19,17 @@ TODO: create a separate Lib. ...@@ -19,18 +19,17 @@ TODO: create a separate Lib.
module Gargantext.Text.Corpus.Parsers.GrandDebat module Gargantext.Text.Corpus.Parsers.GrandDebat
where where
import GHC.IO (FilePath)
import Data.Aeson (ToJSON, FromJSON) import Data.Aeson (ToJSON, FromJSON)
import qualified Data.JsonStream.Parser as P
--import Data.Either (either)
import Data.Maybe (Maybe()) import Data.Maybe (Maybe())
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.ByteString.Lazy as DBL
import GHC.Generics (Generic) import GHC.Generics (Generic)
import Gargantext.Prelude
import Gargantext.Database.Types.Node
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
import Gargantext.Database.Types.Node
import Gargantext.Prelude
import Gargantext.Prelude.Utils
import qualified Data.ByteString.Lazy as DBL
import qualified Data.JsonStream.Parser as P
import qualified Data.Text as Text
data GrandDebatReference = GrandDebatReference data GrandDebatReference = GrandDebatReference
...@@ -91,16 +90,12 @@ instance ToHyperdataDocument GrandDebatReference ...@@ -91,16 +90,12 @@ instance ToHyperdataDocument GrandDebatReference
True -> r' True -> r'
False -> "" False -> ""
class ReadFile a
where
readFile :: FilePath -> IO a
instance ReadFile [GrandDebatReference] instance ReadFile [GrandDebatReference]
where where
-- | read json: 3 version below are working but with increased optimization -- | read json: 3 version below are working but with increased optimization
--readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp --readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
--readFile fp = either (panic . Text.pack) identity <$> P.eitherDecode <$> DBL.readFile fp --readFile fp = either (panic . Text.pack) identity <$> P.eitherDecode <$> DBL.readFile fp
readFile fp = P.parseLazyByteString (P.arrayOf P.value) <$> DBL.readFile fp readFile' fp = P.parseLazyByteString (P.arrayOf P.value) <$> DBL.readFile fp
...@@ -18,24 +18,30 @@ Main type here is String. ...@@ -18,24 +18,30 @@ Main type here is String.
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList) module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
where where
--import Data.Char (toLower) import Codec.Serialise
import qualified Data.List as DL import qualified Data.List as DL
import Data.Maybe (maybe) import Data.Maybe (maybe)
import Data.Map.Strict (Map, toList) import Data.Map.Strict (Map, toList)
import qualified Data.Map.Strict as DM import qualified Data.Map.Strict as DM
import GHC.Generics
import Data.String (String) import Data.String (String)
import Data.Text (Text) import Data.Text (Text)
import Data.Text (pack, unpack, toLower) import Data.Text (pack, unpack, toLower)
import Data.Tuple.Extra (both) import Data.Tuple.Extra (both)
import qualified Data.ByteString.Lazy as BSL
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Prelude.Utils
import Gargantext.Core (Lang(..), allLangs) import Gargantext.Core (Lang(..), allLangs)
import Gargantext.Text.Terms.Mono (words) import Gargantext.Text.Terms.Mono (words)
import Gargantext.Text.Metrics.Count (occurrencesWith) import Gargantext.Text.Metrics.Count (occurrencesWith)
...@@ -172,7 +178,16 @@ toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n) ...@@ -172,7 +178,16 @@ toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
data EventBook = EventBook { events_freq :: Map String Freq data EventBook = EventBook { events_freq :: Map String Freq
, events_n :: Map StringSize TotalFreq , events_n :: Map StringSize TotalFreq
} }
deriving (Show) deriving (Show, Generic)
instance Serialise EventBook
instance (Serialise a, Ord a) => SaveFile (Events a) where
saveFile' f d = BSL.writeFile f (serialise d)
instance (Serialise a, Ord a) => ReadFile (Events a) where
readFile' filepath = deserialise <$> BSL.readFile filepath
emptyEventBook :: [Int] -> Int -> EventBook emptyEventBook :: [Int] -> Int -> EventBook
emptyEventBook ns n = wordToBook ns n " " emptyEventBook ns n = wordToBook ns n " "
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment