Commit dc1e008d authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Learn function to export models

parent 61e8d712
......@@ -71,6 +71,7 @@ import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Prelude.Utils hiding (hash)
import System.FilePath (FilePath)
import qualified Data.List as List
import qualified Data.Map as Map
......@@ -124,7 +125,7 @@ flowCorpusDebat :: FlowCmdM env err m
flowCorpusDebat u n l fp = do
docs <- liftIO ( splitEvery 500
<$> take l
<$> GD.readFile fp
<$> readFile' fp
:: IO [[GD.GrandDebatReference ]]
)
flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
......
......@@ -13,6 +13,8 @@ Portability : POSIX
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE MonoLocalBinds #-}
module Gargantext.Database.Learn where
......@@ -27,42 +29,82 @@ import qualified Data.List as List
import qualified Data.Text as Text
import Gargantext.Database.Schema.NodeNode (nodesToFavorite)
import Gargantext.API.Node (delDocs, Documents(..))
import Gargantext.Database.Utils (Cmd)
import Gargantext.Database.Schema.Node (HasNodeError)
--import Gargantext.Database.Utils (Cmd)
--import Gargantext.Database.Schema.Node (HasNodeError)
import Gargantext.API
import Gargantext.API.Settings
import Gargantext.Database.Flow (FlowCmdM)
text :: FacetDoc -> (NodeId, Text)
text (FacetDoc nId _ _ h _ _) = (nId, title <> "" <> Text.take 100 abstr)
where
title = maybe "" identity (_hyperdataDocument_title h)
abstr = maybe "" identity (_hyperdataDocument_abstract h)
--moreLike docs_fav docs_trash docs_test = do
data FavTrash = IsFav | IsTrash
data FavOrTrash = IsFav | IsTrash
deriving (Eq)
moreLike :: HasNodeError err => FavTrash -> CorpusId -> Cmd err [(NodeId, Maybe Bool)]
--moreLike :: FlowCmdM env error m => FavOrTrash -> CorpusId -> m (Events Bool, [FacetDoc])
moreLike :: FlowCmdM DevEnv GargError m => FavOrTrash -> CorpusId -> m [FacetDoc]
moreLike ft cId = do
let b = if (==) ft IsFav then True else False
priors <- getPriors ft cId
moreLikeWith priors ft cId
---------------------------------------------------------------------------
getPriors :: FlowCmdM DevEnv GargError m => FavOrTrash -> CorpusId -> m (Events Bool)
getPriors ft cId = do
docs_trash <- runViewDocuments cId True Nothing Nothing Nothing
docs_trash <- map text <$> runViewDocuments cId True Nothing Nothing Nothing
docs_fav <- map text <$> filter (\(FacetDoc _ _ _ _ f _) -> f == True) <$> runViewDocuments cId False Nothing Nothing Nothing
docs_test <- map text <$> filter (\(FacetDoc _ _ _ _ f _) -> f == False) <$> runViewDocuments cId False Nothing Nothing Nothing
docs_fav <- filter (\(FacetDoc _ _ _ _ f _) -> f == True)
<$> runViewDocuments cId False Nothing Nothing Nothing
let priors = priorEventsWith snd b ( List.zip (repeat False) docs_fav
<> List.zip (repeat True ) docs_trash
)
let priors = priorEventsWith text (fav2bool ft) ( List.zip (repeat False) docs_fav
<> List.zip (repeat True ) docs_trash
)
pure priors
moreLikeWith :: FlowCmdM DevEnv GargError m => Events Bool -> FavOrTrash -> CorpusId -> m [FacetDoc]
moreLikeWith priors ft cId = do
pure $ filter ((==) (Just $ not b) . snd) $ map (\x -> (fst x, detectDefaultWithPriors snd priors x)) docs_test
docs_test <- filter (\(FacetDoc _ _ _ _ f _) -> f == False)
<$> runViewDocuments cId False Nothing Nothing Nothing
learnModify :: HasNodeError err => FavTrash -> CorpusId -> [NodeId] -> Cmd err [Int]
learnModify favTrash cId ns = case favTrash of
let results = map fst
$ filter ((==) (Just $ not $ fav2bool ft) . snd)
$ map (\f -> (f, detectDefaultWithPriors text priors f)) docs_test
pure results
---------------------------------------------------------------------------
fav2bool :: FavOrTrash -> Bool
fav2bool ft = if (==) ft IsFav then True else False
text :: FacetDoc -> Text
text (FacetDoc _ _ _ h _ _) = title <> "" <> Text.take 100 abstr
where
title = maybe "" identity (_hyperdataDocument_title h)
abstr = maybe "" identity (_hyperdataDocument_abstract h)
---------------------------------------------------------------------------
apply :: (FlowCmdM DevEnv GargError m) => FavOrTrash -> CorpusId -> [NodeId] -> m [Int]
apply favTrash cId ns = case favTrash of
IsFav -> nodesToFavorite $ map (\n -> (cId, n, True)) ns
IsTrash -> delDocs cId (Documents ns)
learnAndApply :: HasNodeError err => FavTrash -> CorpusId -> Cmd err [Int]
learnAndApply ft cId = do
ids <- map fst <$> moreLike ft cId
learnModify ft cId ids
moreLikeAndApply :: FlowCmdM DevEnv GargError m => FavOrTrash -> CorpusId -> m [Int]
moreLikeAndApply ft cId = do
priors <- getPriors ft cId
moreLikeWithAndApply priors ft cId
moreLikeWithAndApply :: FlowCmdM DevEnv GargError m => Events Bool -> FavOrTrash -> CorpusId -> m [Int]
moreLikeWithAndApply priors ft cId = do
ids <- map facetDoc_id <$> moreLikeWith priors ft cId
apply ft cId ids
......@@ -304,3 +304,6 @@ tail' = listSafeN "tail" tailMay
init' :: Text -> [a] -> [a]
init' = listSafeN "init" initMay
------------------------------------------------------------------------
......@@ -54,9 +54,9 @@ class ReadFile a where
readFile' :: FilePath -> IO a
saveFile :: (MonadReader env m, MonadIO m, HasSettings env, SaveFile a)
writeFile :: (MonadReader env m, MonadIO m, HasSettings env, SaveFile a)
=> a -> m FilePath
saveFile a = do
writeFile a = do
dataPath <- view (settings . fileFolder) <$> ask
(fp,fn) <- liftIO $ (toPath 3) . hash . Text.pack . show <$> newStdGen
......
......@@ -19,18 +19,17 @@ TODO: create a separate Lib.
module Gargantext.Text.Corpus.Parsers.GrandDebat
where
import GHC.IO (FilePath)
import Data.Aeson (ToJSON, FromJSON)
import qualified Data.JsonStream.Parser as P
--import Data.Either (either)
import Data.Maybe (Maybe())
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.ByteString.Lazy as DBL
import GHC.Generics (Generic)
import Gargantext.Prelude
import Gargantext.Database.Types.Node
import Gargantext.Core (Lang(..))
import Gargantext.Database.Types.Node
import Gargantext.Prelude
import Gargantext.Prelude.Utils
import qualified Data.ByteString.Lazy as DBL
import qualified Data.JsonStream.Parser as P
import qualified Data.Text as Text
data GrandDebatReference = GrandDebatReference
......@@ -91,16 +90,12 @@ instance ToHyperdataDocument GrandDebatReference
True -> r'
False -> ""
class ReadFile a
where
readFile :: FilePath -> IO a
instance ReadFile [GrandDebatReference]
where
-- | read json: 3 version below are working but with increased optimization
--readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
--readFile fp = either (panic . Text.pack) identity <$> P.eitherDecode <$> DBL.readFile fp
readFile fp = P.parseLazyByteString (P.arrayOf P.value) <$> DBL.readFile fp
readFile' fp = P.parseLazyByteString (P.arrayOf P.value) <$> DBL.readFile fp
......@@ -18,24 +18,30 @@ Main type here is String.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeSynonymInstances #-}
{-# LANGUAGE FlexibleInstances #-}
module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
where
--import Data.Char (toLower)
import Codec.Serialise
import qualified Data.List as DL
import Data.Maybe (maybe)
import Data.Map.Strict (Map, toList)
import qualified Data.Map.Strict as DM
import GHC.Generics
import Data.String (String)
import Data.Text (Text)
import Data.Text (pack, unpack, toLower)
import Data.Tuple.Extra (both)
import qualified Data.ByteString.Lazy as BSL
import Gargantext.Prelude
import Gargantext.Prelude.Utils
import Gargantext.Core (Lang(..), allLangs)
import Gargantext.Text.Terms.Mono (words)
import Gargantext.Text.Metrics.Count (occurrencesWith)
......@@ -172,7 +178,16 @@ toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
data EventBook = EventBook { events_freq :: Map String Freq
, events_n :: Map StringSize TotalFreq
}
deriving (Show)
deriving (Show, Generic)
instance Serialise EventBook
instance (Serialise a, Ord a) => SaveFile (Events a) where
saveFile' f d = BSL.writeFile f (serialise d)
instance (Serialise a, Ord a) => ReadFile (Events a) where
readFile' filepath = deserialise <$> BSL.readFile filepath
emptyEventBook :: [Int] -> Int -> EventBook
emptyEventBook ns n = wordToBook ns n " "
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment