Unverified Commit 45cddcc1 authored by Nicolas Pouillard
parents 863694e6 8f0b7cd4
@@ -19,10 +19,11 @@ Import a corpus binary.
 module Main where
+import Prelude (read)
 import Control.Exception (finally)
 import Servant (ServantErr)
 import Gargantext.Prelude
-import Gargantext.Database.Flow (FlowCmdM, flowCorpus'')
+import Gargantext.Database.Flow (FlowCmdM, flowCorpus)
 import Gargantext.Text.Parsers (FileFormat(CsvHalFormat))
 import Gargantext.Database.Utils (Cmd, )
 import Gargantext.Database.Types.Node (CorpusId)
@@ -38,27 +39,38 @@ import Control.Monad.IO.Class (liftIO)
 main :: IO ()
 main = do
-  [user, iniPath, name, corpusPath] <- getArgs
+  [userCreate, user, name, iniPath, limit, corpusPath] <- getArgs
   --{-
   let createUsers :: Cmd ServantErr Int64
       createUsers = insertUsersDemo
   {-
-  let cmdCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
-      cmdCorpus = flowCorpus (cs user) (cs name) (Mono EN) CsvHalFormat corpusPath
+  let csvCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
+      csvCorpus = flowCorpus (cs user) (cs name) (Multi EN) CsvHalFormat corpusPath
   --}
-  let cmdCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m [CorpusId]
-      cmdCorpus = do
-        docs <- liftIO (splitEvery 3000 <$> readFile corpusPath :: IO [[GrandDebatReference ]])
-        ids <- flowCorpus'' (Text.pack user) (Text.pack name) (Mono FR) docs
-        pure ids
-  -- cmd = {-createUsers >>-} cmdCorpus
+  let debatCorpus :: forall m. FlowCmdM DevEnv ServantErr m => m CorpusId
+      debatCorpus = do
+        docs <- liftIO ( splitEvery 500
+                       <$> take (read limit :: Int)
+                       <$> readFile corpusPath
+                       :: IO [[GrandDebatReference ]]
+                       )
+        flowCorpus (Text.pack user) (Text.pack name) (Multi FR) docs
   env <- newDevEnvWith iniPath
   -- Better if we keep only one call to runCmdDev.
-  _ <- runCmdDev env createUsers
-  _ <- runCmdDev env cmdCorpus
+  _ <- if userCreate == "true"
+         then runCmdDev env createUsers
+         else pure 0 --(cs "false")
+  _ <- runCmdDev env debatCorpus
+  {-
+  _ <- if corpusType == "csv"
+         then runCmdDev env csvCorpus
+         else if corpusType == "debat"
+           then runCmdDev env debatCorpus
+           else panic "corpusType unknown: try \"csv\" or \"debat\""
+  -}
   pure ()
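Note: the new entry point takes six positional arguments and parses the limit with read, which crashes at runtime on malformed input ("Prelude.read: no parse"). A minimal defensive sketch, assuming only base's Text.Read.readMaybe; the usage string is illustrative, not the tool's documented CLI:

    import System.Environment (getArgs)
    import Text.Read (readMaybe)

    -- Parse the limit defensively instead of crashing on "no parse".
    parseLimit :: String -> IO Int
    parseLimit s = case readMaybe s of
      Just n | n > 0 -> pure n
      _              -> error ("invalid limit: " <> s)

    main :: IO ()
    main = do
      args <- getArgs
      case args of
        [_userCreate, _user, _name, _iniPath, limit, _corpusPath] ->
          parseLimit limit >>= print
        _ -> error "usage: userCreate user name iniPath limit corpusPath"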
@@ -32,6 +32,7 @@ module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
 --import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
 --import Gargantext.Text.Metrics.TFICF (Tficf(..))
+--import Debug.Trace (trace)
 import Control.Monad (mapM_)
 import Control.Monad.IO.Class (liftIO)
 import Data.List (concat)
@@ -58,11 +59,12 @@ import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), N
 import Gargantext.Database.Utils (Cmd, CmdM)
 import Gargantext.Ext.IMT (toSchoolName)
 import Gargantext.Prelude
---import Gargantext.Text.List
-import Gargantext.Text.Parsers (parseDocs, FileFormat)
+import Gargantext.Text.List (buildNgramsLists)
+--import Gargantext.Text.Parsers (parseDocs, FileFormat)
 import Gargantext.Text.Terms (TermType(..))
 import Gargantext.Text.Terms (extractTerms)
 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
+import qualified Gargantext.Text.Parsers.GrandDebat as GD
 import Servant (ServantErr)
 import System.FilePath (FilePath)
 import qualified Data.Map as DM
@@ -76,21 +78,33 @@ type FlowCmdM env err m =
   , HasRepoVar env
   )

+flowCorpusDebat :: FlowCmdM env ServantErr m
+                => Username -> CorpusName -> Int -> FilePath -> m CorpusId
+flowCorpusDebat u n l fp = do
+  docs <- liftIO ( splitEvery 500
+                 <$> take l
+                 <$> GD.readFile fp
+                 :: IO [[GD.GrandDebatReference ]]
+                 )
+  flowCorpus u n (Multi FR) docs
+
+{-
 flowCorpus :: FlowCmdM env ServantErr m
            => Username -> CorpusName -> TermType Lang -> FileFormat -> FilePath -> m CorpusId
-flowCorpus u cn la ff fp = liftIO (parseDocs ff fp) >>= \docs -> flowCorpus' u cn la docs
---{-
-flowCorpus'' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
+flowCorpus u cn la ff fp = undefined -- liftIO (parseDocs ff fp) >>= \docs -> flowCorpus' u cn la docs
+
+flowCorpus''' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
            => Username -> CorpusName -> TermType Lang -> [[a]] -> m [CorpusId]
-flowCorpus'' u cn la docs = mapM (\doc -> flowCorpus' u cn la doc) docs
+flowCorpus''' u cn la docs = mapM (\doc -> flowCorpus' u cn la doc) docs
 --}

-flowCorpus' :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
-           => Username -> CorpusName -> TermType Lang -> [a] -> m CorpusId
-flowCorpus' u cn la docs = do
-  ids <- flowCorpusMaster la (map toHyperdataDocument docs)
-  flowCorpusUser u cn ids
+flowCorpus :: (FlowCmdM env ServantErr m, ToHyperdataDocument a)
+           => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
+flowCorpus u cn la docs = do
+  ids <- mapM ((insertMasterDocs la) . (map toHyperdataDocument)) docs
+  flowCorpusUser FR u cn (concat ids)

 -- TODO query with complex query
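Note: flowCorpusDebat streams the corpus in batches: it keeps the first l references and splits them into groups of 500, one insertMasterDocs call per group. A small sketch of just the chunking, assuming splitEvery behaves like chunksOf from the split package:

    import Data.List.Split (chunksOf)  -- stand-in for splitEvery

    chunkDocs :: Int -> Int -> [a] -> [[a]]
    chunkDocs batch limit = chunksOf batch . take limit

    -- chunkDocs 500 1200 yields batches of 500, 500 and 200 documents:
    main :: IO ()
    main = print $ map length (chunkDocs 500 1200 [1 .. 10000 :: Int])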
@@ -99,29 +113,25 @@ flowCorpusSearchInDatabase :: FlowCmdM env ServantErr m
 flowCorpusSearchInDatabase u q = do
   (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster ""
   ids <- map fst <$> searchInDatabase cId (stemIt q)
-  flowCorpusUser u q ids
+  flowCorpusUser FR u q ids

--- TODO uniformize language of corpus
-flowCorpusMaster :: FlowCmdM env ServantErr m => TermType Lang -> [HyperdataDocument] -> m [NodeId]
-flowCorpusMaster la hd = (insertMasterDocs la) $ (map addUniqIdsDoc) hd

-flowCorpusUser :: FlowCmdM env ServantErr m => Username -> CorpusName -> [NodeId] -> m CorpusId
-flowCorpusUser userName corpusName ids = do
+flowCorpusUser :: FlowCmdM env ServantErr m
+               => Lang -> Username -> CorpusName -> [NodeId] -> m CorpusId
+flowCorpusUser l userName corpusName ids = do
   -- User Flow
-  (_userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName
+  (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName
   -- TODO: check if present already, ignore
   _ <- Doc.add userCorpusId ids
   -- User List Flow
-  --(_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
-  --ngs <- buildNgramsLists userCorpusId masterCorpusId
-  --userListId <- flowList userId userCorpusId ngs
-  --printDebug "userListId" userListId
+  (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster ""
+  ngs <- buildNgramsLists l 2 3 userCorpusId masterCorpusId
+  userListId <- flowList userId userCorpusId ngs
+  printDebug "userListId" userListId
   -- User Graph Flow
-  --_ <- mkGraph userCorpusId userId
+  _ <- mkGraph userCorpusId userId
   -- User Dashboard Flow
   -- _ <- mkDashboard userCorpusId userId
@@ -134,20 +144,23 @@ flowCorpusUser userName corpusName ids = do
 insertMasterDocs :: FlowCmdM env ServantErr m
                  => TermType Lang -> [HyperdataDocument] -> m [DocId]
 insertMasterDocs lang hs = do
-  let hyperdataDocuments' = map (\h -> ToDbDocument h) hs
   (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName
-  ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
-  let documentsWithId = mergeData (toInserted ids) (toInsert hs)
+  -- TODO Type NodeDocumentUnicised
+  let hs' = map addUniqIdsDoc hs
+  ids <- insertDocuments masterUserId masterCorpusId NodeDocument
+       $ map ToDbDocument hs'
+  -- ^ TODO Type class to insert Doc
+  -- ^ TODO Type Class AddUnicity where unicity = addUnicity
+  let documentsWithId = mergeData (toInserted ids) (toInsert hs')
   docsWithNgrams <- documentIdWithNgrams (extractNgramsT lang) documentsWithId
   let maps = mapNodeIdNgrams docsWithNgrams
   terms2id <- insertNgrams $ DM.keys maps
   let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
   _ <- insertToNodeNgrams indexedNgrams
   pure $ map reId ids
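Note: insertMasterDocs now hashes each document first (addUniqIdsDoc) so the returned row ids can be matched back to their source documents via mergeData (toInserted ids) (toInsert hs'). A rough sketch of that pairing idea; mergeById and the hash type are hypothetical stand-ins, not the project's API:

    import qualified Data.Map as DM

    -- Pair database row ids back with source documents via the unique
    -- hash computed before insertion (hypothetical mergeData analogue).
    mergeById :: Ord hash
              => DM.Map hash Int  -- inserted: hash -> returned row id
              -> DM.Map hash doc  -- to insert: hash -> original document
              -> [(Int, doc)]
    mergeById inserted docs =
      [ (rowId, d) | (h, rowId) <- DM.toList inserted
                   , Just d     <- [DM.lookup h docs] ]

    main :: IO ()
    main = print $ mergeById (DM.fromList [("h1", 42 :: Int)])
                             (DM.fromList [("h1", "doc one"), ("h2", "doc two")])
    -- [(42,"doc one")]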
@@ -220,51 +233,50 @@ data DocumentIdWithNgrams = DocumentIdWithNgrams
   , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
   } deriving (Show)

+-- TODO extractNgrams according to Type of Data
 extractNgramsT :: HasNodeError err
                => TermType Lang -> HyperdataDocument
                -> Cmd err (Map Ngrams (Map NgramsType Int))
 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd

-extractNgramsT' :: HasNodeError err
-                => TermType Lang -> HyperdataDocument
-                -> Cmd err (Map Ngrams (Map NgramsType Int))
-extractNgramsT' lang doc = do
-  let source     = text2ngrams
-                 $ maybe "Nothing" identity
-                 $ _hyperdataDocument_source doc
-      institutes = map text2ngrams
-                 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
-                 $ _hyperdataDocument_institutes doc
-      authors    = map text2ngrams
-                 $ maybe ["Nothing"] (splitOn ", ")
-                 $ _hyperdataDocument_authors doc
-      leText = catMaybes [ _hyperdataDocument_title    doc
-                         , _hyperdataDocument_abstract doc
-                         ]
-  terms' <- map text2ngrams
-        <$> map (intercalate " " . _terms_label)
-        <$> concat
-        <$> liftIO (extractTerms lang leText)
-  pure $ DM.fromList $  [(source, DM.singleton Sources 1)]
-                     <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
-                     <> [(a', DM.singleton Authors 1)    | a' <- authors    ]
-                     <> [(t', DM.singleton NgramsTerms 1)| t' <- terms'     ]

-filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
-                     -> Map Ngrams (Map NgramsType Int)
-filterNgramsT s ms = DM.fromList $ map (\a -> filter' s a) $ DM.toList ms
   where
-    filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
-      True  -> (ng,y)
-      False -> (Ngrams (Text.take s' t) n , y)
+    extractNgramsT' :: HasNodeError err
+                    => TermType Lang -> HyperdataDocument
+                    -> Cmd err (Map Ngrams (Map NgramsType Int))
+    extractNgramsT' lang' doc = do
+      let source     = text2ngrams
+                     $ maybe "Nothing" identity
+                     $ _hyperdataDocument_source doc
+          institutes = map text2ngrams
+                     $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
+                     $ _hyperdataDocument_institutes doc
+          authors    = map text2ngrams
+                     $ maybe ["Nothing"] (splitOn ", ")
+                     $ _hyperdataDocument_authors doc
+          leText = catMaybes [ _hyperdataDocument_title    doc
+                             , _hyperdataDocument_abstract doc
+                             ]
+      terms' <- map text2ngrams
+            <$> map (intercalate " " . _terms_label)
+            <$> concat
+            <$> liftIO (extractTerms lang' leText)
+      pure $ DM.fromList $  [(source, DM.singleton Sources 1)]
+                         <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
+                         <> [(a', DM.singleton Authors 1)    | a' <- authors    ]
+                         <> [(t', DM.singleton NgramsTerms 1)| t' <- terms'     ]
+
+    filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
+                         -> Map Ngrams (Map NgramsType Int)
+    filterNgramsT s ms = DM.fromList $ map (\a -> filter' s a) $ DM.toList ms
+      where
+        filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
+          True  -> (ng,y)
+          False -> (Ngrams (Text.take s' t) n , y)

 documentIdWithNgrams :: HasNodeError err
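Note: the relocated filterNgramsT keeps over-long ngrams by truncating them to 255 characters instead of dropping them. A self-contained sketch of the same policy on plain Text keys (the real Ngrams value also carries a term count):

    {-# LANGUAGE OverloadedStrings #-}
    import Data.Text (Text)
    import qualified Data.Map  as DM
    import qualified Data.Text as Text

    -- Truncate keys at the size limit rather than discarding the entry.
    capKeys :: Int -> DM.Map Text v -> DM.Map Text v
    capKeys s = DM.mapKeys (\t -> if Text.length t < s then t else Text.take s t)

    main :: IO ()
    main = print $ DM.keys $ capKeys 5 $ DM.fromList [("short", ()), ("far too long", ())]
    -- ["far t","short"]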
@@ -44,13 +44,13 @@ import qualified Database.PostgreSQL.Simple as DPS
 -- a first grouping option to user and get some
 -- enriched data to better learn and improve that algo
 ngramsGroup :: Lang -> Int -> Int -> Text -> Text
-ngramsGroup l m n = Text.intercalate " "
+ngramsGroup l _m _n = Text.intercalate " "
                   . map (stem l)
-                  . take n
+                  -- . take n
                   . List.sort
-                  . (List.filter (\t -> Text.length t > m))
+                  -- . (List.filter (\t -> Text.length t > m))
                   . Text.splitOn " "
                   . Text.replace "-" " "

 sortTficf :: (Map Text (Double, Set Text))
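Note: with the length and take filters commented out, ngramsGroup is now a pure normalizer: dashes become spaces, tokens are split, sorted, stemmed and rejoined, so surface variants collapse onto one group key. A sketch with a toy stemmer standing in for the language-aware stem l:

    {-# LANGUAGE OverloadedStrings #-}
    import Data.Text (Text)
    import qualified Data.List as List
    import qualified Data.Text as Text

    -- Toy stemmer (drops trailing 's'); the real stem l is language-aware.
    stem' :: Text -> Text
    stem' = Text.dropWhileEnd (== 's')

    groupKey :: Text -> Text
    groupKey = Text.intercalate " "
             . map stem'
             . List.sort
             . Text.splitOn " "
             . Text.replace "-" " "

    main :: IO ()
    main = print (groupKey "data-sciences methods", groupKey "methods data science")
    -- both normalize to "data method science"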
@@ -171,8 +171,6 @@ queryNgramsOnlyByNodeUser = [sql|
   GROUP BY nng.node_id, ng.terms
   |]
 ------------------------------------------------------------------------
 -- | TODO filter by language, database, any social field
 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))

@@ -183,10 +181,10 @@ selectNgramsByNodeMaster :: UserCorpusId -> MasterCorpusId -> Cmd err [(NodeId,
 selectNgramsByNodeMaster ucId mcId = runPGSQuery
   queryNgramsByNodeMaster
     ( ucId
     , nodeTypeId   NodeDocument
     , ngramsTypeId NgramsTerms
     , mcId
     , nodeTypeId   NodeDocument
     , ngramsTypeId NgramsTerms
     )
@@ -26,6 +26,7 @@ import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGro
 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
 import Gargantext.Database.Utils (Cmd)
 import Gargantext.Prelude
+--import Gargantext.Text.Terms (TermType(..))
 import qualified Data.Char as Char
 import qualified Data.List as List
 import qualified Data.Map as Map
@@ -33,10 +34,10 @@ import qualified Data.Set as Set
 import qualified Data.Text as Text

 -- | TODO improve grouping functions of Authors, Sources, Institutes..
-buildNgramsLists :: UserCorpusId -> MasterCorpusId
+buildNgramsLists :: Lang -> Int -> Int -> UserCorpusId -> MasterCorpusId
                  -> Cmd err (Map NgramsType [NgramsElement])
-buildNgramsLists uCid mCid = do
-  ngTerms     <- buildNgramsTermsList uCid mCid
+buildNgramsLists l n m uCid mCid = do
+  ngTerms     <- buildNgramsTermsList l n m uCid mCid
   othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
   pure $ Map.unions $ othersTerms <> [ngTerms]
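Note: buildNgramsLists merges the per-type maps with Map.unions, which is left-biased. The maps here cover disjoint NgramsType keys, so nothing is lost, but on overlap the earlier map would win:

    import qualified Data.Map as Map

    main :: IO ()
    main = print $ Map.unions [ Map.fromList [(1 :: Int, "left")]
                              , Map.fromList [(1, "right"), (2, "kept")] ]
    -- fromList [(1,"left"),(2,"kept")]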
@@ -53,13 +54,14 @@ buildNgramsOthersList uCid groupIt nt = do
     ]

 -- TODO remove hard coded parameters
-buildNgramsTermsList :: UserCorpusId -> MasterCorpusId
+buildNgramsTermsList :: Lang -> Int -> Int -> UserCorpusId -> MasterCorpusId
                      -> Cmd err (Map NgramsType [NgramsElement])
-buildNgramsTermsList uCid mCid = do
-  candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup EN 4 2)
+buildNgramsTermsList l n m uCid mCid = do
+  candidates <- sortTficf <$> getTficf' uCid mCid (ngramsGroup l n m)
   --printDebug "candidate" (length candidates)
-  let termList = toTermList (isStopTerm . fst) candidates
+  --let termList = toTermList (isStopTerm . fst) candidates
+  let termList = toTermList ((\_ -> False) . fst) candidates
   --printDebug "termlist" (length termList)
   let ngs = List.concat $ map toNgramsElement termList
@@ -98,14 +100,14 @@ toTermList stop ns = map (toTermList' stop CandidateTerm) xs
     ys = take b $ drop a ns
     zs = drop b $ drop a ns
-    a = 50
-    b = 1000
+    a = 10
+    b = 400

 isStopTerm :: Text -> Bool
 isStopTerm x = Text.length x < 3
             || not (all Char.isAlpha (Text.unpack x'))
   where
-    x' = foldl (\t -> Text.replace t "")
+    x' = foldl (\t -> Text.replace t "a")
         x
         ["-"," ","/","(",")"]
@@ -97,6 +97,7 @@ class ReadFile a
 instance ReadFile [GrandDebatReference]
   where
+    -- | Read JSON: the 3 versions below all work, with increasing optimization.
     --readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
     --readFile fp = either (panic . Text.pack) identity <$> P.eitherDecode <$> DBL.readFile fp
     readFile fp = P.parseLazyByteString (P.arrayOf P.value) <$> DBL.readFile fp
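Note: the retained readFile streams the top-level JSON array with json-stream's parseLazyByteString, avoiding one big in-memory parse. For comparison, a minimal sketch of the strict aeson variant echoing the first commented line (fine for small files, costly on full Grand Débat dumps):

    import Data.Aeson (FromJSON, decode)
    import Data.Maybe (fromMaybe)
    import qualified Data.ByteString.Lazy as DBL

    -- Decode a whole top-level JSON array strictly; [] on parse failure.
    readJsonArray :: FromJSON a => FilePath -> IO [a]
    readJsonArray fp = fromMaybe [] . decode <$> DBL.readFile fp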