Commit 965531a2 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[METRICS] FACTO

parent a22e2dc6
...@@ -48,8 +48,9 @@ main = do ...@@ -48,8 +48,9 @@ main = do
let let
--tt = (Unsupervised EN 6 0 Nothing) --tt = (Unsupervised EN 6 0 Nothing)
tt = (Multi EN) tt = (Multi EN)
format = WOS -- CsvGargV3
cmd :: forall m. FlowCmdM DevEnv GargError m => m CorpusId cmd :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
cmd = flowCorpusFile (cs user) (cs name) (read limit :: Int) tt CsvGargV3 corpusPath cmd = flowCorpusFile (cs user) (cs name) (read limit :: Int) tt format corpusPath
{- {-
let debatCorpus :: forall m. FlowCmdM DevEnv GargError m => m CorpusId let debatCorpus :: forall m. FlowCmdM DevEnv GargError m => m CorpusId
debatCorpus = do debatCorpus = do
......
...@@ -60,8 +60,8 @@ data StopSize = StopSize {unStopSize :: Int} ...@@ -60,8 +60,8 @@ data StopSize = StopSize {unStopSize :: Int}
buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsLists l n m s uCid mCid = do buildNgramsLists l n m s uCid mCid = do
ngTerms <- buildNgramsTermsList l n m s uCid mCid --ngTerms <- buildNgramsTermsList l n m s uCid mCid
--ngTerms <- buildNgramsTermsList' uCid (ngramsGroup l n m) (isStopTerm s . fst) 500 50 ngTerms <- buildNgramsTermsList' uCid (ngramsGroup l n m) (isStopTerm s . fst) 500 50
othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes] othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
pure $ Map.unions $ othersTerms <> [ngTerms] pure $ Map.unions $ othersTerms <> [ngTerms]
......
...@@ -102,13 +102,26 @@ takeScored listSize incSize = both (map _scored_terms) ...@@ -102,13 +102,26 @@ takeScored listSize incSize = both (map _scored_terms)
linearTakes :: (Ord b1, Ord b2) linearTakes :: (Ord b1, Ord b2)
=> GraphListSize -> InclusionSize => GraphListSize -> InclusionSize
-> (a -> b2) -> (a -> b1) -> [a] -> ([a],[a]) -> (a -> b2) -> (a -> b1) -> [a] -> ([a],[a])
linearTakes gls incSize speGen incExc = (List.splitAt gls) linearTakes mls incSize speGen incExc = (List.splitAt mls)
. List.concat . List.concat
. map (take $ round . map (take $ round
$ (fromIntegral gls :: Double) $ (fromIntegral mls :: Double)
/ (fromIntegral incSize :: Double) / (fromIntegral incSize :: Double)
) )
. map (sortOn speGen) . map (sortOn speGen)
. splitEvery incSize . splitEvery incSize
. take 5000
. takePercent (0.70)
. sortOn incExc . sortOn incExc
takePercent :: Double -> [a] -> [a]
takePercent l xs = List.take l' xs
where
l' = round $ l * (fromIntegral $ List.length xs)
splitTake :: (Int, a -> Bool) -> (Int, a -> Bool) -> [a] -> ([a], [a])
splitTake (a, af) (b, bf) xs = (mpa <> mpb, ca <> cb)
where
(mpa, ca) = List.splitAt a $ List.filter af xs
(mpb, cb) = List.splitAt b $ List.filter bf xs
...@@ -129,11 +129,10 @@ termsUnsupervised _ = undefined ...@@ -129,11 +129,10 @@ termsUnsupervised _ = undefined
newTries :: Int -> Text -> Tries Token () newTries :: Int -> Text -> Tries Token ()
newTries n t = buildTries n (fmap toToken $ uniText t) newTries n t = buildTries n (fmap toToken $ uniText t)
-- | TODO removing long terms > 24
uniText :: Text -> [[Text]] uniText :: Text -> [[Text]]
uniText = uniText = map (List.filter (not . isPunctuation))
-- map (map (Text.toLower))
map (List.filter (not . isPunctuation))
. map tokenize . map tokenize
. sentences -- | TODO get sentences according to lang . sentences -- | TODO get sentences according to lang
. Text.toLower . Text.toLower
{-| {-|
Module : Gargantext.Text.Ngrams.Token.Text Module : Gargantext.Text.Ngrams.Token.Text
Description : Description : Tokenizer main functions
Copyright : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present Copyright : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present
License : AGPL + CECILL v3 License : AGPL + CECILL v3
Maintainer : team@gargantext.org Maintainer : team@gargantext.org
...@@ -77,20 +77,20 @@ run :: Tokenizer -> (Text -> [Text]) ...@@ -77,20 +77,20 @@ run :: Tokenizer -> (Text -> [Text])
run f = \txt -> map T.copy $ (map unwrap . unE . f) txt run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
defaultTokenizer :: Tokenizer defaultTokenizer :: Tokenizer
defaultTokenizer = whitespace defaultTokenizer = whitespace
>=> uris >=> uris
>=> punctuation >=> punctuation
>=> contractions >=> contractions
>=> negatives >=> negatives
-- | Detect common uris and freeze them -- | Detect common uris and freeze them
uris :: Tokenizer uris :: Tokenizer
uris x | isUri x = E [Left x] uris x | isUri x = E [Left x]
| True = E [Right x] | True = E [Right x]
where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"] where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:","https://"]
-- | Split off initial and final punctuation -- | Split off initial and final punctuation
punctuation :: Tokenizer punctuation :: Tokenizer
punctuation = finalPunctuation >=> initialPunctuation punctuation = finalPunctuation >=> initialPunctuation
--hyphens :: Tokenizer --hyphens :: Tokenizer
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment