Commit 16f3bbd8 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[WORKFLOW] Unsupervised ngrams extraction implemented.

parent a5bd188c
Pipeline #485 failed with stage
...@@ -20,10 +20,12 @@ Portability : POSIX ...@@ -20,10 +20,12 @@ Portability : POSIX
{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-} {-# LANGUAGE RankNTypes #-}
{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE ConstrainedClassMethods #-}
module Gargantext.Database.Flow -- (flowDatabase, ngrams2list) module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
where where
...@@ -58,14 +60,14 @@ import Gargantext.Database.Utils (Cmd, CmdM) ...@@ -58,14 +60,14 @@ import Gargantext.Database.Utils (Cmd, CmdM)
import Gargantext.Ext.IMT (toSchoolName) import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile) import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Text.Terms.Eleve (buildTries, toToken)
import Gargantext.Text.List (buildNgramsLists,StopSize(..)) import Gargantext.Text.List (buildNgramsLists,StopSize(..))
import Gargantext.Text.Parsers (parseFile, FileFormat) import Gargantext.Text.Parsers (parseFile, FileFormat)
import Gargantext.Text.Terms (TermType(..), tt_lang) import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
import Gargantext.Text.Terms (extractTerms)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt) import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import Servant (ServantErr) import Servant (ServantErr)
import System.FilePath (FilePath) import System.FilePath (FilePath)
--import qualified Data.List as List import qualified Data.List as List
import qualified Data.Map as Map import qualified Data.Map as Map
import qualified Data.Text as Text import qualified Data.Text as Text
import qualified Gargantext.Database.Node.Document.Add as Doc (add) import qualified Gargantext.Database.Node.Document.Add as Doc (add)
...@@ -82,6 +84,7 @@ type FlowCorpus a = ( AddUniqId a ...@@ -82,6 +84,7 @@ type FlowCorpus a = ( AddUniqId a
, UniqId a , UniqId a
, InsertDb a , InsertDb a
, ExtractNgramsT a , ExtractNgramsT a
, HasText a
) )
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -186,8 +189,17 @@ insertMasterDocs c lang hs = do ...@@ -186,8 +189,17 @@ insertMasterDocs c lang hs = do
ids <- insertDb masterUserId masterCorpusId hs' ids <- insertDb masterUserId masterCorpusId hs'
let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs') let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
let
fixLang (Unsupervised l n m) = Unsupervised l n m'
where
m' = case m of
Nothing -> Just $ buildTries n (fmap toToken $ uniText $ Text.intercalate " " $ List.concat $ map hasText documentsWithId)
m'' -> m''
fixLang l = l
lang' = fixLang lang
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int)) -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang) documentsWithId maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
terms2id <- insertNgrams $ Map.keys maps terms2id <- insertNgrams $ Map.keys maps
let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
...@@ -265,6 +277,10 @@ data DocumentWithId a = DocumentWithId ...@@ -265,6 +277,10 @@ data DocumentWithId a = DocumentWithId
, documentData :: !a , documentData :: !a
} deriving (Show) } deriving (Show)
instance HasText a => HasText (DocumentWithId a)
where
hasText (DocumentWithId _ a) = hasText a
mergeData :: Map HashId ReturnId mergeData :: Map HashId ReturnId
-> Map HashId a -> Map HashId a
-> [DocumentWithId a] -> [DocumentWithId a]
...@@ -280,12 +296,18 @@ data DocumentIdWithNgrams a = DocumentIdWithNgrams ...@@ -280,12 +296,18 @@ data DocumentIdWithNgrams a = DocumentIdWithNgrams
, document_ngrams :: !(Map Ngrams (Map NgramsType Int)) , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
} deriving (Show) } deriving (Show)
-- TODO extractNgrams according to Type of Data
class ExtractNgramsT h class ExtractNgramsT h
where where
extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int)) extractNgramsT :: HasText h => TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
class HasText h
where
hasText :: h -> [Text]
instance HasText HyperdataContact
where
hasText = undefined
instance ExtractNgramsT HyperdataContact instance ExtractNgramsT HyperdataContact
where where
...@@ -300,19 +322,20 @@ instance ExtractNgramsT HyperdataContact ...@@ -300,19 +322,20 @@ instance ExtractNgramsT HyperdataContact
pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ] pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
instance HasText HyperdataDocument
where
hasText h = catMaybes [ _hyperdataDocument_title h
, _hyperdataDocument_abstract h
]
instance ExtractNgramsT HyperdataDocument instance ExtractNgramsT HyperdataDocument
where where
extractNgramsT = extractNgramsT' extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
extractNgramsT' :: TermType Lang -> HyperdataDocument
-> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
where where
extractNgramsT'' :: TermType Lang -> HyperdataDocument extractNgramsT' :: TermType Lang -> HyperdataDocument
-> Cmd err (Map Ngrams (Map NgramsType Int)) -> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT'' lang' doc = do extractNgramsT' lang' doc = do
let source = text2ngrams let source = text2ngrams
$ maybe "Nothing" identity $ maybe "Nothing" identity
$ _hyperdataDocument_source doc $ _hyperdataDocument_source doc
...@@ -325,14 +348,10 @@ extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd ...@@ -325,14 +348,10 @@ extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
$ maybe ["Nothing"] (splitOn ", ") $ maybe ["Nothing"] (splitOn ", ")
$ _hyperdataDocument_authors doc $ _hyperdataDocument_authors doc
leText = catMaybes [ _hyperdataDocument_title doc
, _hyperdataDocument_abstract doc
]
terms' <- map text2ngrams terms' <- map text2ngrams
<$> map (intercalate " " . _terms_label) <$> map (intercalate " " . _terms_label)
<$> concat <$> concat
<$> liftIO (extractTerms lang' leText) <$> liftIO (extractTerms lang' $ hasText doc)
pure $ Map.fromList $ [(source, Map.singleton Sources 1)] pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
<> [(i', Map.singleton Institutes 1) | i' <- institutes ] <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
......
...@@ -72,10 +72,17 @@ makeLenses ''TermType ...@@ -72,10 +72,17 @@ makeLenses ''TermType
-- | Sugar to extract terms from text (hiddeng mapM from end user). -- | Sugar to extract terms from text (hiddeng mapM from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms]) --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]] extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms (Unsupervised l n m) xs = mapM (terms (Unsupervised l n m')) xs
extractTerms (Unsupervised l n m) xs = mapM (terms (Unsupervised l n (Just m'))) xs
where where
m' = maybe (Just $ newTries n (Text.intercalate " " xs)) Just m m' = case m of
Just m''-> m''
Nothing -> newTries n (Text.intercalate " " xs)
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Terms from Text -- | Terms from Text
-- Mono : mono terms -- Mono : mono terms
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment