Hetero.purs 2.63 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
{-|
Module      : Gargantext.Text.Hetero
Description : Heterogeneity measures over texts
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Heterogeneity scores for texts: the total number of occurrences in an
occurrence map divided by its number of distinct keys (see @dicoStruct@).
-}

14
module Gargantext.Text.Hetero where
15 16 17 18 19 20 21 22 23

import Control.Exception (bracket)
import GHC.Real as R
import Data.Set as S
import Data.Map as M
import Data.List.Split as S
import Database.PostgreSQL.Simple as PGS
import Opaleye.PGTypes (PGInt4)
import Opaleye.Internal.Column (Column)

Alexandre Delanoë's avatar
Alexandre Delanoë committed
24 25 26
import Gargantext.Database.Gargandb
import Gargantext.Database.Private
--import Gargantext.Utils.Chronos
27

28 29
import Gargantext.Text.Words (cleanText)
import Gargantext.Text.Count (occurrences)
30

Alexandre Delanoë's avatar
Alexandre Delanoë committed
31
import Gargantext.Database.Simple
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88

--main = do
--    t <- getTextquery
--    print (Prelude.map (heterogeinity . concat) $ S.chunksOf 3 t) 

-- heterogeinity sur concat texts
heterogeinity' :: Int -> Int -> Int -> IO [Integer]
heterogeinity' corpus_id limit x = do
    t <- getAbstract corpus_id limit
    Prelude.mapM (dicoStruct . occurrences) $ (S.chunksOf x) .  cleanText $ concat t

heterogeinity'' :: Int -> Int -> Int -> IO [Integer]
heterogeinity'' corpus_id limit size = do
    t <- getAbstract corpus_id limit
    Prelude.mapM (dicoStruct . occurrences) $ (S.chunksOf size) .  cleanText $ concat t


dicoStruct :: (Integral r, Monad m) => M.Map t r -> m r
dicoStruct dict_occ = do
    let keys_size = toInteger $ length $ M.keys dict_occ
    let total_occ = sum $ Prelude.map (\(x, y) -> y) $ M.toList dict_occ
    return $ div total_occ (fromIntegral keys_size)

-- heterogeinity sur UCT (Unité de Context Textuel)
heterogeinity :: [Char] -> IO Integer
heterogeinity string = do
    let dict_occ = occurrences $ cleanText string
    
    let keys_size = toInteger $ length $ M.keys dict_occ
    let total_occ = sum $ Prelude.map (\(x, y) -> y) $ M.toList dict_occ
    
    return $ div total_occ (fromIntegral keys_size)
    

--computeHeterogeinity
--  :: Fractional t =>
--       Opaleye.Internal.Column.Column Opaleye.PGTypes.PGInt4
--            -> IO (t, Integer, Integer)
computeHeterogeinity corpus_id = do
    c <- PGS.connect infoGargandb
    t <- getText c (nodeHyperdataText corpus_id)
    heterogeinity $ Prelude.concat t

main2 = do
    let corpus_ids = [
                ("ALL", 272927) -- 73
               ,("Histoire", 1387736) -- 28
               ,("Sciences Po", 1296892) -- 37
               ,("Phylosophie", 1170004) -- 20
               ,("Psychologie", 1345852) -- 37
               ,("Sociologie", 1246452)  -- 42
               ]
    
    r <- Prelude.map computeHeterogeinity $ Prelude.map (\(t,id) -> id) corpus_ids
    return r