Commit 16f3bbd8 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[WORKFLOW] Unsupervised ngrams extraction implemented.

parent a5bd188c
Pipeline #485 failed with stage
......@@ -20,10 +20,12 @@ Portability : POSIX
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ConstrainedClassMethods #-}
module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
where
......@@ -58,14 +60,14 @@ import Gargantext.Database.Utils (Cmd, CmdM)
import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Prelude
import Gargantext.Text.Terms.Eleve (buildTries, toToken)
import Gargantext.Text.List (buildNgramsLists,StopSize(..))
import Gargantext.Text.Parsers (parseFile, FileFormat)
import Gargantext.Text.Terms (TermType(..), tt_lang)
import Gargantext.Text.Terms (extractTerms)
import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import Servant (ServantErr)
import System.FilePath (FilePath)
--import qualified Data.List as List
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Text as Text
import qualified Gargantext.Database.Node.Document.Add as Doc (add)
......@@ -82,6 +84,7 @@ type FlowCorpus a = ( AddUniqId a
, UniqId a
, InsertDb a
, ExtractNgramsT a
, HasText a
)
------------------------------------------------------------------------
......@@ -186,8 +189,17 @@ insertMasterDocs c lang hs = do
ids <- insertDb masterUserId masterCorpusId hs'
let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
let
fixLang (Unsupervised l n m) = Unsupervised l n m'
where
m' = case m of
Nothing -> Just $ buildTries n (fmap toToken $ uniText $ Text.intercalate " " $ List.concat $ map hasText documentsWithId)
m'' -> m''
fixLang l = l
lang' = fixLang lang
-- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang) documentsWithId
maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
terms2id <- insertNgrams $ Map.keys maps
let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
......@@ -265,6 +277,10 @@ data DocumentWithId a = DocumentWithId
, documentData :: !a
} deriving (Show)
instance HasText a => HasText (DocumentWithId a)
where
hasText (DocumentWithId _ a) = hasText a
mergeData :: Map HashId ReturnId
-> Map HashId a
-> [DocumentWithId a]
......@@ -280,12 +296,18 @@ data DocumentIdWithNgrams a = DocumentIdWithNgrams
, document_ngrams :: !(Map Ngrams (Map NgramsType Int))
} deriving (Show)
-- TODO extractNgrams according to Type of Data
class ExtractNgramsT h
where
extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT :: HasText h => TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
class HasText h
where
hasText :: h -> [Text]
instance HasText HyperdataContact
where
hasText = undefined
instance ExtractNgramsT HyperdataContact
where
......@@ -300,19 +322,20 @@ instance ExtractNgramsT HyperdataContact
pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
instance HasText HyperdataDocument
where
hasText h = catMaybes [ _hyperdataDocument_title h
, _hyperdataDocument_abstract h
]
instance ExtractNgramsT HyperdataDocument
where
extractNgramsT = extractNgramsT'
extractNgramsT' :: TermType Lang -> HyperdataDocument
-> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
where
extractNgramsT'' :: TermType Lang -> HyperdataDocument
extractNgramsT' :: TermType Lang -> HyperdataDocument
-> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT'' lang' doc = do
extractNgramsT' lang' doc = do
let source = text2ngrams
$ maybe "Nothing" identity
$ _hyperdataDocument_source doc
......@@ -325,14 +348,10 @@ extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
$ maybe ["Nothing"] (splitOn ", ")
$ _hyperdataDocument_authors doc
leText = catMaybes [ _hyperdataDocument_title doc
, _hyperdataDocument_abstract doc
]
terms' <- map text2ngrams
<$> map (intercalate " " . _terms_label)
<$> concat
<$> liftIO (extractTerms lang' leText)
<$> liftIO (extractTerms lang' $ hasText doc)
pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
<> [(i', Map.singleton Institutes 1) | i' <- institutes ]
......
......@@ -72,10 +72,17 @@ makeLenses ''TermType
-- | Sugar to extract terms from text (hiddeng mapM from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms (Unsupervised l n m) xs = mapM (terms (Unsupervised l n m')) xs
extractTerms (Unsupervised l n m) xs = mapM (terms (Unsupervised l n (Just m'))) xs
where
m' = maybe (Just $ newTries n (Text.intercalate " " xs)) Just m
m' = case m of
Just m''-> m''
Nothing -> newTries n (Text.intercalate " " xs)
extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
------------------------------------------------------------------------
-- | Terms from Text
-- Mono : mono terms
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment