Commit cb739183 authored by Alexandre Delanoë's avatar Alexandre Delanoë Committed by Alfredo Di Napoli

[SPECS] Main Specifications to add Chinese lang

parent f0674838
...@@ -24,6 +24,7 @@ import Data.Set (Set) ...@@ -24,6 +24,7 @@ import Data.Set (Set)
import Data.Text (Text, concat, pack, splitOn) import Data.Text (Text, concat, pack, splitOn)
import Data.Vector (Vector) import Data.Vector (Vector)
import Gargantext.API.Admin.EnvTypes (Env, GargJob(..)) import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
import Gargantext.Core (Lang)
import Gargantext.API.Admin.Orchestrator.Types import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.API.Ngrams (setListNgrams) import Gargantext.API.Ngrams (setListNgrams)
import Gargantext.API.Ngrams.List.Types import Gargantext.API.Ngrams.List.Types
...@@ -34,7 +35,7 @@ import Gargantext.API.Prelude (GargServer, GargM, GargError) ...@@ -34,7 +35,7 @@ import Gargantext.API.Prelude (GargServer, GargM, GargError)
import Gargantext.API.Types import Gargantext.API.Types
import Gargantext.Core.NodeStory import Gargantext.Core.NodeStory
import Gargantext.Core.Text.Terms (ExtractedNgrams(..)) import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatterns, termsInText) import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatternsWith, termsInText)
import Gargantext.Core.Types (TermsCount) import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Types.Main (ListType(..)) import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Database.Action.Flow (saveDocNgramsWith) import Gargantext.Database.Action.Flow (saveDocNgramsWith)
...@@ -142,7 +143,7 @@ setList l m = do ...@@ -142,7 +143,7 @@ setList l m = do
pure True pure True
------------------------------------------------------------------------ ------------------------------------------------------------------------
-- | Re-index documents of a corpus with new ngrams (called orphans here) -- | Re-index documents of a corpus with ngrams in the list
reIndexWith :: ( HasNodeStory env err m reIndexWith :: ( HasNodeStory env err m
, FlowCmdM env err m , FlowCmdM env err m
) )
...@@ -153,6 +154,7 @@ reIndexWith :: ( HasNodeStory env err m ...@@ -153,6 +154,7 @@ reIndexWith :: ( HasNodeStory env err m
-> m () -> m ()
reIndexWith cId lId nt lts = do reIndexWith cId lId nt lts = do
-- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts) -- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
-- corpus_node <- getNode cId -- (Proxy :: Proxy HyperdataCorpus)
-- Getting [NgramsTerm] -- Getting [NgramsTerm]
ts <- List.concat ts <- List.concat
...@@ -166,24 +168,24 @@ reIndexWith cId lId nt lts = do ...@@ -166,24 +168,24 @@ reIndexWith cId lId nt lts = do
let let
-- fromListWith (<>) -- fromListWith (<>)
ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b'))))) ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
$ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v))) $ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
$ map (docNgrams nt ts) docs $ map (docNgrams Nothing {-here lang-} nt ts) docs
-- printDebug "ngramsByDoc: " ngramsByDoc
-- Saving the indexation in database -- Saving the indexation in database
_ <- mapM (saveDocNgramsWith lId) ngramsByDoc _ <- mapM (saveDocNgramsWith lId) ngramsByDoc
-- _ <- refreshNgramsMaterialized
pure () pure ()
docNgrams :: NgramsType
docNgrams :: Maybe Lang
-> NgramsType
-> [NgramsTerm] -> [NgramsTerm]
-> Gargantext.Database.Admin.Types.Node.Context HyperdataDocument -> Gargantext.Database.Admin.Types.Node.Context HyperdataDocument
-> [((MatchedText, TermsCount), -> [((MatchedText, TermsCount),
Map NgramsType (Map NodeId Int))] Map NgramsType (Map NodeId Int))]
docNgrams nt ts doc = docNgrams lang nt ts doc =
List.zip List.zip
(termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts) (termsInText lang (buildPatternsWith lang ts)
$ Text.unlines $ catMaybes $ Text.unlines $ catMaybes
[ doc ^. context_hyperdata . hd_title [ doc ^. context_hyperdata . hd_title
, doc ^. context_hyperdata . hd_abstract , doc ^. context_hyperdata . hd_abstract
......
...@@ -205,7 +205,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q ...@@ -205,7 +205,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
-- printDebug "[addToCorpusWithQuery] datafield" datafield -- printDebug "[addToCorpusWithQuery] datafield" datafield
-- printDebug "[addToCorpusWithQuery] flowListWith" flw -- printDebug "[addToCorpusWithQuery] flowListWith" flw
-- TODO -- TODO: update Node Corpus with the Lang
-- n <- getNode cid -- n <- getNode cid
-- let n.wq_lang = l -- let n.wq_lang = l
-- saveNode n -- saveNode n
...@@ -270,7 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do ...@@ -270,7 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
-- printDebug "[addToCorpusWithForm] fileType" ft -- printDebug "[addToCorpusWithForm] fileType" ft
-- printDebug "[addToCorpusWithForm] fileFormat" ff -- printDebug "[addToCorpusWithForm] fileFormat" ff
-- TODO -- TODO: update Node Corpus with the Lang
-- n <- getNode cid -- n <- getNode cid
-- let n.wq_lang = l -- let n.wq_lang = l
-- saveNode n -- saveNode n
...@@ -380,7 +380,7 @@ addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m) ...@@ -380,7 +380,7 @@ addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
-> m () -> m ()
addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
-- TODO -- TODO: update Node Corpus with the Lang
-- n <- getNode cid -- n <- getNode cid
-- let n.wq_lang = l -- let n.wq_lang = l
-- saveNode n -- saveNode n
......
...@@ -18,7 +18,9 @@ module Gargantext.Core.Text.Terms.WithList where ...@@ -18,7 +18,9 @@ module Gargantext.Core.Text.Terms.WithList where
import Data.List (null) import Data.List (null)
import Data.Ord import Data.Ord
import Data.Text (Text, concat, unwords) import Data.Text (Text, concat, unwords)
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core (Lang(ZH))
import Gargantext.Core.Text.Context import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence) import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount) import Gargantext.Core.Types (TermsCount)
...@@ -27,6 +29,8 @@ import Prelude (error) ...@@ -27,6 +29,8 @@ import Prelude (error)
import qualified Data.Algorithms.KMP as KMP import qualified Data.Algorithms.KMP as KMP
import qualified Data.IntMap.Strict as IntMap import qualified Data.IntMap.Strict as IntMap
import qualified Data.List as List import qualified Data.List as List
import qualified Data.Text as Text
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Pattern = Pattern data Pattern = Pattern
...@@ -63,6 +67,10 @@ replaceTerms rplaceTerms pats terms = go 0 ...@@ -63,6 +67,10 @@ replaceTerms rplaceTerms pats terms = go 0
merge (len1, lab1) (len2, lab2) = merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2) if len2 < len1 then (len1, lab1) else (len2, lab2)
buildPatternsWith :: Maybe Lang -> [NgramsTerm] -> Patterns
buildPatternsWith (Just ZH) ts = buildPatterns $ map (\k -> (Text.chunksOf 1 $ unNgramsTerm k, [])) ts
buildPatternsWith _ ts = buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts
buildPatterns :: TermList -> Patterns buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
where where
...@@ -78,14 +86,14 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern ...@@ -78,14 +86,14 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
-------------------------------------------------------------------------- --------------------------------------------------------------------------
-- Utils -- Utils
type MatchedText = Text type MatchedText = Text
termsInText :: Patterns -> Text -> [(MatchedText, TermsCount)] termsInText :: Maybe Lang -> Patterns -> Text -> [(MatchedText, TermsCount)]
termsInText pats txt = groupWithCounts termsInText (Just ZH) pats txt = termsInText Nothing pats (addSpaces txt)
termsInText _ pats txt = groupWithCounts
$ List.concat $ List.concat
$ map (map unwords) $ map (map unwords)
$ extractTermsWithList pats txt $ extractTermsWithList pats txt
-------------------------------------------------------------------------- --------------------------------------------------------------------------
extractTermsWithList :: Patterns -> Text -> Corpus [Text] extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence
...@@ -97,6 +105,11 @@ extractTermsWithList' :: Patterns -> Text -> [Text] ...@@ -97,6 +105,11 @@ extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats) extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
. monoTextsBySentence . monoTextsBySentence
--------------------------------------------------------------------------
addSpaces :: Text -> Text
addSpaces = (Text.intercalate " ") . (Text.chunksOf 1)
-------------------------------------------------------------------------- --------------------------------------------------------------------------
{- | Not used {- | Not used
......
...@@ -107,8 +107,9 @@ corpusIdtoDocuments timeUnit corpusId = do ...@@ -107,8 +107,9 @@ corpusIdtoDocuments timeUnit corpusId = do
Just termList' -> buildPatterns termList' Just termList' -> buildPatterns termList'
pure $ map (toPhyloDocs patterns timeUnit) (map _context_hyperdata docs) pure $ map (toPhyloDocs patterns timeUnit) (map _context_hyperdata docs)
-- TODO: Add lang to enable Chinese phylomemy
termsInText' :: Patterns -> Text -> [Text] termsInText' :: Patterns -> Text -> [Text]
termsInText' p t = (map fst) $ termsInText p t termsInText' p t = (map fst) $ termsInText Nothing p t
toPhyloDocs :: Patterns -> TimeUnit -> HyperdataDocument -> Document toPhyloDocs :: Patterns -> TimeUnit -> HyperdataDocument -> Document
toPhyloDocs patterns time d = toPhyloDocs patterns time d =
......
...@@ -65,7 +65,7 @@ flowPhylo cId = do ...@@ -65,7 +65,7 @@ flowPhylo cId = do
patterns = buildPatterns termList patterns = buildPatterns termList
-- | To filter the Ngrams of a document based on the termList -- | To filter the Ngrams of a document based on the termList
filterTerms :: Patterns -> (Date, Text) -> (Date, [Text]) filterTerms :: Patterns -> (Date, Text) -> (Date, [Text])
filterTerms patterns' (y,d) = (y, fst <$> termsInText patterns' d) filterTerms patterns' (y,d) = (y, fst <$> termsInText Nothing patterns' d)
docs = map ((\(y,t) -> Document y t) . filterTerms patterns) docs' docs = map ((\(y,t) -> Document y t) . filterTerms patterns) docs'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment