Commit cb739183 authored by Alexandre Delanoë's avatar Alexandre Delanoë Committed by Alfredo Di Napoli

[SPECS] Main Specifications to add Chinese lang

parent f0674838
......@@ -24,6 +24,7 @@ import Data.Set (Set)
import Data.Text (Text, concat, pack, splitOn)
import Data.Vector (Vector)
import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
import Gargantext.Core (Lang)
import Gargantext.API.Admin.Orchestrator.Types
import Gargantext.API.Ngrams (setListNgrams)
import Gargantext.API.Ngrams.List.Types
......@@ -34,7 +35,7 @@ import Gargantext.API.Prelude (GargServer, GargM, GargError)
import Gargantext.API.Types
import Gargantext.Core.NodeStory
import Gargantext.Core.Text.Terms (ExtractedNgrams(..))
import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatterns, termsInText)
import Gargantext.Core.Text.Terms.WithList (MatchedText, buildPatternsWith, termsInText)
import Gargantext.Core.Types (TermsCount)
import Gargantext.Core.Types.Main (ListType(..))
import Gargantext.Database.Action.Flow (saveDocNgramsWith)
......@@ -142,7 +143,7 @@ setList l m = do
pure True
------------------------------------------------------------------------
-- | Re-index documents of a corpus with new ngrams (called orphans here)
-- | Re-index documents of a corpus with ngrams in the list
reIndexWith :: ( HasNodeStory env err m
, FlowCmdM env err m
)
......@@ -153,6 +154,7 @@ reIndexWith :: ( HasNodeStory env err m
-> m ()
reIndexWith cId lId nt lts = do
-- printDebug "(cId,lId,nt,lts)" (cId, lId, nt, lts)
-- corpus_node <- getNode cId -- (Proxy :: Proxy HyperdataCorpus)
-- Getting [NgramsTerm]
ts <- List.concat
......@@ -167,23 +169,23 @@ reIndexWith cId lId nt lts = do
-- fromListWith (<>)
ngramsByDoc = map (HashMap.fromListWith (Map.unionWith (Map.unionWith (\(_a,b) (_a',b') -> (1,b+b')))))
$ map (map (\((k, cnt), v) -> (SimpleNgrams (text2ngrams k), over (traverse . traverse) (\p -> (p, cnt)) v)))
$ map (docNgrams nt ts) docs
-- printDebug "ngramsByDoc: " ngramsByDoc
$ map (docNgrams Nothing {-here lang-} nt ts) docs
-- Saving the indexation in database
_ <- mapM (saveDocNgramsWith lId) ngramsByDoc
-- _ <- refreshNgramsMaterialized
pure ()
docNgrams :: NgramsType
docNgrams :: Maybe Lang
-> NgramsType
-> [NgramsTerm]
-> Gargantext.Database.Admin.Types.Node.Context HyperdataDocument
-> [((MatchedText, TermsCount),
Map NgramsType (Map NodeId Int))]
docNgrams nt ts doc =
docNgrams lang nt ts doc =
List.zip
(termsInText (buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts)
(termsInText lang (buildPatternsWith lang ts)
$ Text.unlines $ catMaybes
[ doc ^. context_hyperdata . hd_title
, doc ^. context_hyperdata . hd_abstract
......
......@@ -205,7 +205,7 @@ addToCorpusWithQuery user cid (WithQuery { _wq_query = q
-- printDebug "[addToCorpusWithQuery] datafield" datafield
-- printDebug "[addToCorpusWithQuery] flowListWith" flw
-- TODO
-- TODO: update Node Corpus with the Lang
-- n <- getNode cid
-- let n.wq_lang = l
-- saveNode n
......@@ -270,7 +270,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
-- printDebug "[addToCorpusWithForm] fileType" ft
-- printDebug "[addToCorpusWithForm] fileFormat" ff
-- TODO
-- TODO: update Node Corpus with the Lang
-- n <- getNode cid
-- let n.wq_lang = l
-- saveNode n
......@@ -380,7 +380,7 @@ addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
-> m ()
addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do
-- TODO
-- TODO: update Node Corpus with the Lang
-- n <- getNode cid
-- let n.wq_lang = l
-- saveNode n
......
......@@ -18,7 +18,9 @@ module Gargantext.Core.Text.Terms.WithList where
import Data.List (null)
import Data.Ord
import Data.Text (Text, concat, unwords)
import Gargantext.API.Ngrams.Types (NgramsTerm(..))
import Gargantext.Prelude
import Gargantext.Core (Lang(ZH))
import Gargantext.Core.Text.Context
import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
import Gargantext.Core.Types (TermsCount)
......@@ -27,6 +29,8 @@ import Prelude (error)
import qualified Data.Algorithms.KMP as KMP
import qualified Data.IntMap.Strict as IntMap
import qualified Data.List as List
import qualified Data.Text as Text
------------------------------------------------------------------------
data Pattern = Pattern
......@@ -63,6 +67,10 @@ replaceTerms rplaceTerms pats terms = go 0
merge (len1, lab1) (len2, lab2) =
if len2 < len1 then (len1, lab1) else (len2, lab2)
buildPatternsWith :: Maybe Lang -> [NgramsTerm] -> Patterns
buildPatternsWith (Just ZH) ts = buildPatterns $ map (\k -> (Text.chunksOf 1 $ unNgramsTerm k, [])) ts
buildPatternsWith _ ts = buildPatterns $ map (\k -> (Text.splitOn " " $ unNgramsTerm k, [])) ts
buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
where
......@@ -78,14 +86,14 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
--------------------------------------------------------------------------
-- Utils
type MatchedText = Text
termsInText :: Patterns -> Text -> [(MatchedText, TermsCount)]
termsInText pats txt = groupWithCounts
termsInText :: Maybe Lang -> Patterns -> Text -> [(MatchedText, TermsCount)]
termsInText (Just ZH) pats txt = termsInText Nothing pats (addSpaces txt)
termsInText _ pats txt = groupWithCounts
$ List.concat
$ map (map unwords)
$ extractTermsWithList pats txt
--------------------------------------------------------------------------
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence
......@@ -97,6 +105,11 @@ extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
. monoTextsBySentence
--------------------------------------------------------------------------
addSpaces :: Text -> Text
addSpaces = (Text.intercalate " ") . (Text.chunksOf 1)
--------------------------------------------------------------------------
{- | Not used
......
......@@ -107,8 +107,9 @@ corpusIdtoDocuments timeUnit corpusId = do
Just termList' -> buildPatterns termList'
pure $ map (toPhyloDocs patterns timeUnit) (map _context_hyperdata docs)
-- TODO: Add lang to enable Chinese phylomemy
termsInText' :: Patterns -> Text -> [Text]
termsInText' p t = (map fst) $ termsInText p t
termsInText' p t = (map fst) $ termsInText Nothing p t
toPhyloDocs :: Patterns -> TimeUnit -> HyperdataDocument -> Document
toPhyloDocs patterns time d =
......
......@@ -65,7 +65,7 @@ flowPhylo cId = do
patterns = buildPatterns termList
-- | To filter the Ngrams of a document based on the termList
filterTerms :: Patterns -> (Date, Text) -> (Date, [Text])
filterTerms patterns' (y,d) = (y, fst <$> termsInText patterns' d)
filterTerms patterns' (y,d) = (y, fst <$> termsInText Nothing patterns' d)
docs = map ((\(y,t) -> Document y t) . filterTerms patterns) docs'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment