Unverified commit 45cddcc1 authored by Nicolas Pouillard
parents 863694e6 8f0b7cd4
......@@ -19,10 +19,11 @@ Import a corpus binary.
module Main where
import Prelude (read)
import Control.Exception (finally)
import Servant (ServantErr)
import Gargantext.Prelude
import Gargantext.Database.Flow (FlowCmdM, flowCorpus'')
import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
import Gargantext.Text.Parsers (FileFormat(CsvHalFormat))
import Gargantext.Database.Utils (Cmd, )
import Gargantext.Database.Types.Node (CorpusId)
......@@ -38,27 +39,38 @@ import Control.Monad.IO.Class (liftIO)
-- | Dev CLI tool: import a corpus into the database.
-- NOTE(review): this span interleaves two revisions of the same code
-- (two competing getArgs patterns, old and new corpus actions) — it looks
-- like a pasted diff; keep exactly one version of each before building.
main :: IO ()
main = do
-- Old CLI shape: user iniPath name corpusPath
[user, iniPath, name, corpusPath] <- getArgs
-- New CLI shape: userCreate user name iniPath limit corpusPath
[userCreate, user, name, iniPath, limit, corpusPath] <- getArgs
--{-
-- Creates the demo users; only run when userCreate == "true" (see below).
let createUsers :: Cmd ServantErr Int64
createUsers = insertUsersDemo
{-
let cmdCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
cmdCorpus = flowCorpus (cs user) (cs name) (Mono EN) CsvHalFormat corpusPath
let csvCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
csvCorpus = flowCorpus (cs user) (cs name) (Multi EN) CsvHalFormat corpusPath
--}
-- Old action: chunks of 3000, Mono FR, via flowCorpus''.
let cmdCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m [CorpusId]
cmdCorpus = do
docs <- liftIO (splitEvery 3000 <$> readFile corpusPath :: IO [[GrandDebatReference ]])
ids <- flowCorpus'' (Text.pack user) (Text.pack name) (Mono FR) docs
pure ids
-- New action: read the Grand Débat dump, keep at most `limit` documents,
-- insert in chunks of 500.
-- NOTE(review): `read limit :: Int` is partial — it crashes on a
-- non-numeric argument; prefer readMaybe plus a usage message.
let debatCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
debatCorpus = do
docs <- liftIO ( splitEvery 500
<$> take (read limit :: Int)
<$> readFile corpusPath
:: IO [[GrandDebatReference ]]
)
flowCorpus (Text.pack user) (Text.pack name) (Multi FR) docs
-- cmd = {-createUsers >>-} cmdCorpus
env <- newDevEnvWith iniPath
-- Better if we keep only one call to runCmdDev.
_ <- runCmdDev env createUsers
_ <- runCmdDev env cmdCorpus
-- New version: users are only created when explicitly requested.
_ <- if userCreate == "true"
then runCmdDev env createUsers
else pure 0 --(cs "false")
_ <- runCmdDev env debatCorpus
{-
_ <- if corpusType == "csv"
then runCmdDev env csvCorpus
else if corpusType == "debat"
then runCmdDev env debatCorpus
else panic "corpusType unknown: try \"csv\" or \"debat\""
-}
pure ()
......@@ -32,6 +32,7 @@ module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
--import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
--import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
--import Gargantext.Text.Metrics.TFICF (Tficf(..))
--import Debug.Trace (trace)
import Control.Monad (mapM_)
import Control.Monad.IO.Class (liftIO)
import Data.List (concat)
......@@ -58,11 +59,12 @@ import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), N
import Gargantext.Database.Utils (Cmd, CmdM)
import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Prelude
--import Gargantext.Text.List
import Gargantext.Text.Parsers (parseDocs, FileFormat)
import Gargantext.Text.List (buildNgramsLists)
--import Gargantext.Text.Parsers (parseDocs, FileFormat)
import Gargantext.Text.Terms (TermType(..))
import Gargantext.Text.Terms (extractTerms)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import qualified Gargantext.Text.Parsers.GrandDebat as GD
import Servant (ServantErr)
import System.FilePath (FilePath)
import qualified Data.Map as DM
......@@ -76,21 +78,33 @@ type FlowCmdM env err m =
, HasRepoVar env
)
-- | Flow the Grand Débat JSON dump at @fp@ into a corpus: read the file,
-- keep at most @l@ documents, and insert them in chunks of 500 as French
-- multi-terms.
flowCorpusDebat :: FlowCmdM env ServantErr m
                => Username -> CorpusName -> Int -> FilePath -> m CorpusId
flowCorpusDebat u n l fp = do
  chunks <- liftIO ( splitEvery 500 . take l
                     <$> GD.readFile fp
                     :: IO [[GD.GrandDebatReference ]]
                   )
  flowCorpus u n (Multi FR) chunks
-- NOTE(review): diff residue below — the old (flowCorpus'') and new
-- (flowCorpus''') spellings of the same definitions are interleaved, and
-- the {- ... --} block-comment nesting no longer looks balanced.
-- Reconcile by keeping exactly one copy of each line.
{-
flowCorpus :: FlowCmdM env ServantErr m
=> Username -> CorpusName -> TermType Lang -> FileFormat -> FilePath -> m CorpusId
flowCorpus u cn la ff fp = liftIO (parseDocs ff fp) >>= \docs -> flowCorpus' u cn la docs
flowCorpus u cn la ff fp = undefined -- liftIO (parseDocs ff fp) >>= \docs -> flowCorpus' u cn la docs
--{-
flowCorpus'' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
flowCorpus''' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
=> Username -> CorpusName -> TermType Lang -> [[a]] -> m [CorpusId]
flowCorpus'' u cn la docs = mapM (\doc -> flowCorpus' u cn la doc) docs
flowCorpus''' u cn la docs = mapM (\doc -> flowCorpus' u cn la doc) docs
--}
-- | Insert one chunk of documents into the master corpus, then attach the
-- resulting node ids to the user's corpus.
flowCorpus' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
=> Username -> CorpusName -> TermType Lang -> [a] -> m CorpusId
flowCorpus' u cn la docs = do
ids <- flowCorpusMaster la (map toHyperdataDocument docs)
-- NOTE(review): elsewhere in this diff flowCorpusUser gained a leading
-- Lang argument (e.g. `flowCorpusUser FR u cn ...`); this 3-argument call
-- looks stale — confirm which version of flowCorpusUser is live.
flowCorpusUser u cn ids
-- | Flow pre-chunked documents into the database: every chunk is inserted
-- into the master corpus, then all resulting document ids are attached to
-- the user's corpus.
-- NOTE(review): the list language is hard-coded to FR here even though a
-- TermType Lang is passed in — confirm this is intended.
flowCorpus :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
           => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
flowCorpus u cn la docs = do
  idss <- mapM (insertMasterDocs la . map toHyperdataDocument) docs
  flowCorpusUser FR u cn (concat idss)
-- TODO query with complex query
......@@ -99,29 +113,25 @@ flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
-- | Build a user corpus from a full-text search in the master corpus.
flowCorpusSearchInDatabase u q = do
(_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster ""
-- Stem the query and collect the matching document ids.
ids <- map fst <$> searchInDatabase cId (stemIt q)
-- NOTE(review): old 3-argument call — superseded by the FR call below.
flowCorpusUser u q ids
-- TODO uniformize language of corpus
-- NOTE(review): flowCorpusMaster (removed helper) is sandwiched here by
-- the diff rendering, between the old and new final lines of
-- flowCorpusSearchInDatabase.
flowCorpusMaster :: FlowCmdM env ServantErr m => TermType Lang -> [HyperdataDocument] -> m [NodeId]
flowCorpusMaster la hd = (insertMasterDocs la) $ (map addUniqIdsDoc) hd
flowCorpusUser FR u q ids
-- | Create (or fetch) the user's corpus node, attach the document ids,
-- build the ngrams lists, and build the graph.
-- NOTE(review): old 3-argument signature and new Lang-first signature are
-- both present below (diff residue) — keep only the second.
flowCorpusUser :: FlowCmdM env ServantErr m => Username -> CorpusName -> [NodeId] -> m CorpusId
flowCorpusUser userName corpusName ids = do
flowCorpusUser :: FlowCmdM env ServantErr m
=> Lang -> Username -> CorpusName -> [NodeId] -> m CorpusId
flowCorpusUser l userName corpusName ids = do
-- User Flow
-- NOTE(review): duplicated bind — the old line discards userId, the new
-- one keeps it (it is needed by flowList below).
(_userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName
(userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName
-- TODO: check if present already, ignore
_ <- Doc.add userCorpusId ids
-- User List Flow
--(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
--ngs <- buildNgramsLists userCorpusId masterCorpusId
--userListId <- flowList userId userCorpusId ngs
--printDebug "userListId" userListId
(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
-- 2 and 3 are the ngramsGroup size parameters, hard-coded here.
ngs <- buildNgramsLists l 2 3 userCorpusId masterCorpusId
userListId <- flowList userId userCorpusId ngs
printDebug "userListId" userListId
-- User Graph Flow
--_ <- mkGraph userCorpusId userId
_ <- mkGraph userCorpusId userId
-- User Dashboard Flow
-- _ <- mkDashboard userCorpusId userId
......@@ -134,20 +144,23 @@ flowCorpusUser userName corpusName ids = do
-- | Insert documents into the master corpus and index their ngrams;
-- returns the inserted document ids.
-- NOTE(review): diff residue — the old insertion path (no unique ids) and
-- the new one (addUniqIdsDoc) are both present; keep only the new one.
insertMasterDocs :: FlowCmdM env ServantErr m
=> TermType Lang -> [HyperdataDocument] -> m [DocId]
insertMasterDocs lang hs = do
-- Old path:
let hyperdataDocuments' = map (\h -> ToDbDocument h) hs
(masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName
ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
let documentsWithId = mergeData (toInserted ids) (toInsert hs)
-- TODO Type NodeDocumentUnicised
-- New path: give each document a unique id before insertion.
let hs' = map addUniqIdsDoc hs
ids <- insertDocuments masterUserId masterCorpusId NodeDocument
$ map ToDbDocument hs'
-- ^ TODO Type class to insert Doc
-- ^ TODO Type Class AddUnicity where unicity = addUnicity
let documentsWithId = mergeData (toInserted ids) (toInsert hs')
-- Extract ngrams per document, index them, and link them to the nodes.
docsWithNgrams <- documentIdWithNgrams (extractNgramsT lang) documentsWithId
let maps = mapNodeIdNgrams docsWithNgrams
terms2id <- insertNgrams $ DM.keys maps
let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
-- NOTE(review): line duplicated by the diff (indentation-only change).
_ <- insertToNodeNgrams indexedNgrams
_ <- insertToNodeNgrams indexedNgrams
pure $ map reId ids
......@@ -220,51 +233,50 @@ data DocumentIdWithNgrams = DocumentIdWithNgrams
, document_ngrams :: !(Map Ngrams (Map NgramsType Int))
} deriving (Show)
-- TODO extractNgrams according to Type of Data
-- | Extract a document's ngrams and truncate every term to at most
-- 255 characters (presumably a storage limit — confirm).
extractNgramsT :: HasNodeError err
               => TermType Lang -> HyperdataDocument
               -> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT lang hd = fmap (filterNgramsT 255) (extractNgramsT' lang hd)
-- | Turn one document into its ngrams, keyed by ngrams type (Sources,
-- Institutes, Authors, Terms).
-- NOTE(review): an almost identical copy (with lang renamed to lang')
-- appears further down — diff residue; keep exactly one.
extractNgramsT' :: HasNodeError err
=> TermType Lang -> HyperdataDocument
-> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT' lang doc = do
-- "Nothing" is the textual placeholder for a missing metadata field.
let source = text2ngrams
$ maybe "Nothing" identity
$ _hyperdataDocument_source doc
institutes = map text2ngrams
$ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
$ _hyperdataDocument_institutes doc
authors = map text2ngrams
$ maybe ["Nothing"] (splitOn ", ")
$ _hyperdataDocument_authors doc
leText = catMaybes [ _hyperdataDocument_title doc
, _hyperdataDocument_abstract doc
]
-- Terms are extracted from title + abstract with the given language.
terms' <- map text2ngrams
<$> map (intercalate " " . _terms_label)
<$> concat
<$> liftIO (extractTerms lang leText)
-- Each ngram starts with a count of 1 in its bucket.
pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
<> [(i', DM.singleton Institutes 1) | i' <- institutes ]
<> [(a', DM.singleton Authors 1) | a' <- authors ]
<> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
-- | Cap every ngram term at s characters; longer terms are truncated.
-- NOTE(review): truncation can make two keys collide; DM.fromList then
-- keeps the last (i.e. greatest original) key's value.
filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
-> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = DM.fromList $ map (\a -> filter' s a) $ DM.toList ms
where
filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
True -> (ng,y)
False -> (Ngrams (Text.take s' t) n , y)
-- | Turn one document into its ngrams, keyed by ngrams type: the source,
-- institutes and authors come straight from the metadata fields, the
-- terms are extracted from title + abstract with the given language.
extractNgramsT' :: HasNodeError err
                => TermType Lang -> HyperdataDocument
                -> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT' lang' doc = do
  extracted <- liftIO (extractTerms lang' fullText)
  let terms' = map (text2ngrams . intercalate " " . _terms_label)
             $ concat extracted
  -- Each ngram starts with a count of 1 in its bucket.
  pure $ DM.fromList
       $  [(source, DM.singleton Sources    1)]
       <> [(i', DM.singleton Institutes  1) | i' <- institutes ]
       <> [(a', DM.singleton Authors     1) | a' <- authors    ]
       <> [(t', DM.singleton NgramsTerms 1) | t' <- terms'     ]
  where
    -- "Nothing" is the textual placeholder for a missing metadata field.
    source = text2ngrams
           $ maybe "Nothing" identity
           $ _hyperdataDocument_source doc
    institutes = map text2ngrams
               $ maybe ["Nothing"] (map toSchoolName . splitOn ", ")
               $ _hyperdataDocument_institutes doc
    authors = map text2ngrams
            $ maybe ["Nothing"] (splitOn ", ")
            $ _hyperdataDocument_authors doc
    fullText = catMaybes [ _hyperdataDocument_title doc
                         , _hyperdataDocument_abstract doc
                         ]
-- | Truncate every ngram term to at most s characters.
-- Truncation can merge two keys; 'DM.mapKeys' then keeps the value of the
-- greatest original key, which is exactly what 'DM.fromList' over the
-- ascending 'DM.toList' did in the previous formulation.
filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
              -> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = DM.mapKeys shorten ms
  where
    shorten ng@(Ngrams t n)
      | Text.length t < s = ng
      | otherwise         = Ngrams (Text.take s t) n
documentIdWithNgrams :: HasNodeError err
......
......@@ -44,13 +44,13 @@ import qualified Database.PostgreSQL.Simple as DPS
-- a first grouping option to user and get some
-- enriched data to better learn and improve that algo
-- | Normalize an ngram for grouping: map dashes to spaces, split on
-- spaces, stem each word and re-join.
-- NOTE(review): two bodies below (diff residue). In the second one the
-- m (minimum word length) and n (words kept) parameters are ignored — the
-- filter and take steps are commented out — so the signature over-promises.
ngramsGroup :: Lang -> Int -> Int -> Text -> Text
ngramsGroup l m n = Text.intercalate " "
. map (stem l)
. take n
. List.sort
. (List.filter (\t -> Text.length t > m))
. Text.splitOn " "
. Text.replace "-" " "
ngramsGroup l _m _n = Text.intercalate " "
. map (stem l)
-- . take n
. List.sort
-- . (List.filter (\t -> Text.length t > m))
. Text.splitOn " "
. Text.replace "-" " "
sortTficf :: (Map Text (Double, Set Text))
......@@ -171,8 +171,6 @@ queryNgramsOnlyByNodeUser = [sql|
GROUP BY nng.node_id, ng.terms
|]
------------------------------------------------------------------------
-- | TODO filter by language, database, any social field
getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
......@@ -183,10 +181,10 @@ selectNgramsByNodeMaster :: UserCorpusId -> MasterCorpusId -> Cmd err [(NodeId,
-- | Fetch document ngrams (NgramsTerms) for both the user corpus and the
-- master corpus via one SQL query.
selectNgramsByNodeMaster ucId mcId = runPGSQuery
queryNgramsByNodeMaster
-- Positional parameters for the SQL template, in order.
-- NOTE(review): `nodeTypeId NodeDocument` appears twice in a row, twice —
-- this looks like diff residue of an indentation-only change; confirm how
-- many parameters the query actually takes.
( ucId
, nodeTypeId NodeDocument
, nodeTypeId NodeDocument
, ngramsTypeId NgramsTerms
, mcId
, nodeTypeId NodeDocument
, nodeTypeId NodeDocument
, ngramsTypeId NgramsTerms
)
......
......@@ -26,6 +26,7 @@ import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGro
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Utils (Cmd)
import Gargantext.Prelude
--import Gargantext.Text.Terms (TermType(..))
import qualified Data.Char as Char
import qualified Data.List as List
import qualified Data.Map as Map
......@@ -33,10 +34,10 @@ import qualified Data.Set as Set
import qualified Data.Text as Text
-- | TODO improve grouping functions of Authors, Sources, Institutes..
-- | Build the (language-aware) terms list plus the Authors / Sources /
-- Institutes lists and merge them into a single map.
-- NOTE(review): old and new signatures/bodies are interleaved below
-- (diff residue) — keep only the Lang-parameterized version.
buildNgramsLists :: UserCorpusId -> MasterCorpusId
buildNgramsLists :: Lang -> Int -> Int -> UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsLists uCid mCid = do
ngTerms <- buildNgramsTermsList uCid mCid
buildNgramsLists l n m uCid mCid = do
ngTerms <- buildNgramsTermsList l n m uCid mCid
othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
pure $ Map.unions $ othersTerms <> [ngTerms]
......@@ -53,13 +54,14 @@ buildNgramsOthersList uCid groupIt nt = do
]
-- TODO remove hard coded parameters
-- | Build the NgramsTerms list from TFICF-sorted candidates.
-- NOTE(review): old and new versions interleaved below (diff residue);
-- the new one takes the grouping parameters (l, n, m) from the caller
-- instead of the hard-coded (EN, 4, 2).
buildNgramsTermsList :: UserCorpusId -> MasterCorpusId
buildNgramsTermsList :: Lang -> Int -> Int -> UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 4 2)
buildNgramsTermsList l n m uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup l n m)
--printDebug "candidate" (length candidates)
let termList = toTermList (isStopTerm . fst) candidates
--let termList = toTermList (isStopTerm . fst) candidates
-- NOTE(review): `(\_ -> False)` disables stop-term filtering entirely —
-- confirm this is a temporary debugging change and not the intent.
let termList = toTermList ((\_ -> False) . fst) candidates
--printDebug "termlist" (length termList)
let ngs = List.concat $ map toNgramsElement termList
......@@ -98,14 +100,14 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs
ys = take b $ drop a ns
zs = drop b $ drop a ns
a = 50
b = 1000
a = 10
b = 400
-- | A term is a stop term when it is shorter than 3 characters, or when —
-- after mapping layout separators to a placeholder letter — it still
-- contains a non-alphabetic character.
isStopTerm :: Text -> Bool
isStopTerm x = Text.length x < 3
  || not (all Char.isAlpha (Text.unpack x'))
  where
    -- Replace each separator with the alphabetic placeholder "a" so the
    -- isAlpha check ignores it.
    -- Fix: the previous fold passed its arguments to Text.replace in the
    -- wrong order (the accumulated word was the needle and the separator
    -- the haystack), so x' ended up being the last separator instead of
    -- the cleaned word.
    x' = foldl (\acc sep -> Text.replace sep (Text.pack "a") acc)
               x
               (map Text.pack ["-", " ", "/", "(", ")"])
......@@ -97,6 +97,7 @@ class ReadFile a
-- | Read a whole file as a JSON array of GrandDebatReference.
instance ReadFile [GrandDebatReference]
where
-- The 3 versions below all work; per the original author each one is
-- more optimized than the previous. Only the last (streaming parse of
-- the top-level array) is live.
--readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
--readFile fp = either (panic . Text.pack) identity <$> P.eitherDecode <$> DBL.readFile fp
-- NOTE(review): DBL.readFile is lazy I/O — read errors and the handle's
-- lifetime are deferred until the result is consumed; confirm callers
-- force the list promptly.
readFile fp = P.parseLazyByteString (P.arrayOf P.value) <$> DBL.readFile fp
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment