Commit 7665bcb6 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TERMS] main function.

parent c0914f9a
...@@ -47,6 +47,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer ...@@ -47,6 +47,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, elem, die, mod, div, const, either , elem, die, mod, div, const, either
, curry, uncurry , curry, uncurry
, otherwise, when , otherwise, when
, undefined
) )
-- TODO import functions optimized in Utils.Count -- TODO import functions optimized in Utils.Count
......
...@@ -35,16 +35,6 @@ data Group = Group { _group_label :: Terms ...@@ -35,16 +35,6 @@ data Group = Group { _group_label :: Terms
} deriving (Show) } deriving (Show)
clean :: Text -> Text
clean txt = DT.map clean' txt
where
clean' '’' = '\''
clean' c = c
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
------------------------------------------------------------------- -------------------------------------------------------------------
-- Contexts of text -- Contexts of text
sentences :: Text -> [Text] sentences :: Text -> [Text]
...@@ -84,4 +74,3 @@ testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances ...@@ -84,4 +74,3 @@ testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances
-- group ngrams -- group ngrams
--ocs = occ $ ws --ocs = occ $ ws
...@@ -12,84 +12,14 @@ Mainly reexport functions in @Data.Text.Metrics@ ...@@ -12,84 +12,14 @@ Mainly reexport functions in @Data.Text.Metrics@
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Metrics where
module Gargantext.Text.Metrics (levenshtein --import Data.Text (Text)
, levenshteinNorm --import GHC.Real (Ratio)
, damerauLevenshtein --import qualified Data.Text.Metrics as DTM
, damerauLevenshteinNorm
, overlap
, jaccard
, hamming
) where
import Data.Text (Text)
import GHC.Real (Ratio)
import qualified Data.Text.Metrics as DTM
import Gargantext.Prelude
{- * Example de titre
-}
-- | This module provide metrics to compare Text
-- starting as an API rexporting main functions of the great lib
-- text-metrics of Mark Karpov
-- | Levenshtein Distance
-- In information theory, Linguistics and computer science,
-- the Levenshtein distance is a string metric for measuring
-- the difference between two sequences.
-- See: https://en.wikipedia.org/wiki/Levenshtein_distance
-- --
levenshtein :: Text -> Text -> Int --import Gargantext.Prelude
levenshtein = DTM.levenshtein
-- | Return normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
-- --
levenshteinNorm :: Text -> Text -> Ratio Int --noApax :: Ord a => Map a Occ -> Map a Occ
levenshteinNorm = DTM.levenshteinNorm --noApax m = M.filter (>1) m
-- | Return Damerau-Levenshtein distance between two 'Text' values. The
-- function works like 'levenshtein', but the collection of allowed
-- operations also includes transposition of two /adjacent/ characters.
-- See also:
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
--
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein = DTM.damerauLevenshtein
-- damerau-Levenshtein distance normalized
--
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
-- Treating inputs like sets
-- | Return overlap coefficient for two 'Text' values. Returned value
-- is in the range from 0 (no similarity) to 1 (exact match). Return 1
-- if both 'Text' values are empty.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
overlap :: Text -> Text -> Ratio Int
overlap = DTM.overlap
-- | Jaccard distance
-- measures dissimilarity between sample sets
jaccard :: Text -> Text -> Ratio Int
jaccard = DTM.jaccard
-- | Hamming Distance
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols are different. In other words, it measures the minimum number of
-- substitutions required to change one string into the other
-- See: https://en.wikipedia.org/wiki/Hamming_distance
hamming :: Text -> Text -> Maybe Int
hamming = DTM.hamming
{-|
Module : Gargantext.Text.Metrics.CharByChar
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mainly reexport functions in @Data.Text.Metrics@
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Metrics.CharByChar (levenshtein
, levenshteinNorm
, damerauLevenshtein
, damerauLevenshteinNorm
, overlap
, jaccard
, hamming
) where
import Data.Text (Text)
import GHC.Real (Ratio)
import qualified Data.Text.Metrics as DTM
import Gargantext.Prelude
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
{- * Example de titre
-}
-- | This module provide metrics to compare Text
-- starting as an API rexporting main functions of the great lib
-- text-metrics of Mark Karpov
-- | Levenshtein Distance
-- In information theory, Linguistics and computer science,
-- the Levenshtein distance is a string metric for measuring
-- the difference between two sequences.
-- See: https://en.wikipedia.org/wiki/Levenshtein_distance
--
levenshtein :: Text -> Text -> Int
levenshtein = DTM.levenshtein
-- | Return normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
--
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm = DTM.levenshteinNorm
-- | Return Damerau-Levenshtein distance between two 'Text' values. The
-- function works like 'levenshtein', but the collection of allowed
-- operations also includes transposition of two /adjacent/ characters.
-- See also:
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
--
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein = DTM.damerauLevenshtein
-- damerau-Levenshtein distance normalized
--
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
-- Treating inputs like sets
-- | Return overlap coefficient for two 'Text' values. Returned value
-- is in the range from 0 (no similarity) to 1 (exact match). Return 1
-- if both 'Text' values are empty.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
overlap :: Text -> Text -> Ratio Int
overlap = DTM.overlap
-- | Jaccard distance
-- measures dissimilarity between sample sets
jaccard :: Text -> Text -> Ratio Int
jaccard = DTM.jaccard
-- | Hamming Distance
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols are different. In other words, it measures the minimum number of
-- substitutions required to change one string into the other
-- See: https://en.wikipedia.org/wiki/Hamming_distance
hamming :: Text -> Text -> Maybe Int
hamming = DTM.hamming
...@@ -37,6 +37,7 @@ import Data.Either.Extra(Either()) ...@@ -37,6 +37,7 @@ import Data.Either.Extra(Either())
import Data.Text (Text) import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8) import Data.Text.Encoding (decodeUtf8)
import qualified Data.Text as DT
---- ----
--import Control.Monad (join) --import Control.Monad (join)
import Codec.Archive.Zip (withArchive, getEntry, getEntries) import Codec.Archive.Zip (withArchive, getEntry, getEntries)
...@@ -108,4 +109,10 @@ openZip fp = do ...@@ -108,4 +109,10 @@ openZip fp = do
bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
pure bs pure bs
clean :: Text -> Text
clean txt = DT.map clean' txt
where
clean' '’' = '\''
clean' c = c
...@@ -19,8 +19,6 @@ Using Latin numerical prefixes, an n-gram of size 1 is referred to as a ...@@ -19,8 +19,6 @@ Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
Source: https://en.wikipedia.org/wiki/Ngrams Source: https://en.wikipedia.org/wiki/Ngrams
TODO TODO
-- Prelude.concat <$> Prelude.map (filter (\n -> _my_token_pos n == Just NP)) <$> extractNgrams Gargantext.Core.EN testText_en
group Ngrams -> Tree group Ngrams -> Tree
compute occ by node of Tree compute occ by node of Tree
group occs according groups group occs according groups
...@@ -35,10 +33,27 @@ compute graph ...@@ -35,10 +33,27 @@ compute graph
module Gargantext.Text.Terms module Gargantext.Text.Terms
where where
import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoterms')
data TermType = Mono | Multi
------------------------------------------------------------------------ ------------------------------------------------------------------------
tokenTag2terms :: TokenTag -> Terms terms :: TermType -> Maybe Lang -> Text -> IO [Terms]
tokenTag2terms (TokenTag w t _ _) = Terms w t terms Mono (Just lang) txt = pure $ monoterms' lang txt
terms Multi (Just lang ) txt = multiterms lang txt
terms _ Nothing _ = panic "Lang needed"
------------------------------------------------------------------------ ------------------------------------------------------------------------
termTests :: Text
termTests = "It is hard to detect important articles in a specific context. Information retrieval techniques based on full text search can be inaccurate to identify main topics and they are not able to provide an indication about the importance of the article. Generating a citation network is a good way to find most popular articles but this approach is not context aware. The text around a citation mark is generally a good summary of the referred article. So citation context analysis presents an opportunity to use the wisdom of crowd for detecting important articles in a context sensitive way. In this work, we analyze citation contexts to rank articles properly for a given topic. The model proposed uses citation contexts in order to create a directed and edge-labeled citation network based on the target topic. Then we apply common ranking algorithms in order to find important articles in this newly created network. We showed that this method successfully detects a good subset of most prominent articles in a given topic. The biggest contribution of this approach is that we are able to identify important articles for a given search term even though these articles do not contain this search term. This technique can be used in other linked documents including web pages, legal documents, and patents as well as scientific papers."
...@@ -13,7 +13,7 @@ Mono-terms are Nterms where n == 1. ...@@ -13,7 +13,7 @@ Mono-terms are Nterms where n == 1.
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Mono module Gargantext.Text.Terms.Mono (monoterms, monoterms')
where where
import Data.Text (Text, toLower, split, splitOn, pack) import Data.Text (Text, toLower, split, splitOn, pack)
...@@ -26,12 +26,14 @@ import Gargantext.Text.Terms.Mono.Stem (stem) ...@@ -26,12 +26,14 @@ import Gargantext.Text.Terms.Mono.Stem (stem)
import Gargantext.Prelude import Gargantext.Prelude
import Data.Char (isAlphaNum, isSpace) import Data.Char (isAlphaNum, isSpace)
monoterms :: Text -> [Term] monoterms' :: Lang -> Text -> [Terms]
monoterms' l txt = map (text2terms l) $ monoterms txt
monoterms :: Text -> [Text]
monoterms txt = map toLower $ split isWord txt monoterms txt = map toLower $ split isWord txt
where where
isWord c = c `elem` [' ', '\'', ',', ';'] isWord c = c `elem` [' ', '\'', ',', ';']
text2terms :: Lang -> Text -> Terms text2terms :: Lang -> Text -> Terms
text2terms lang txt = Terms label stems text2terms lang txt = Terms label stems
where where
......
...@@ -23,11 +23,20 @@ module Gargantext.Text.Terms.Mono.Token (tokenize) ...@@ -23,11 +23,20 @@ module Gargantext.Text.Terms.Mono.Token (tokenize)
import Data.Text (Text) import Data.Text (Text)
import qualified Gargantext.Text.Terms.Mono.Token.En as En import qualified Gargantext.Text.Terms.Mono.Token.En as En
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
type Token = Text type Token = Text
-- >>> tokenize "A rose is a rose is a rose." -- >>> tokenize "A rose is a rose is a rose."
-- ["A","rose","is","a","rose","is","a","rose", "."] -- ["A","rose","is","a","rose","is","a","rose", "."]
-- --
data Context = Letter | Word | Sentence | Line | Paragraph
tokenize :: Text -> [Token] tokenize :: Text -> [Token]
tokenize = En.tokenize tokenize = En.tokenize
tokenize' :: Lang -> Context -> [Token]
tokenize' = undefined
...@@ -13,10 +13,11 @@ Multi-terms are ngrams where n > 1. ...@@ -13,10 +13,11 @@ Multi-terms are ngrams where n > 1.
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Multi (extractTokenTags) module Gargantext.Text.Terms.Multi (multiterms)
where where
import Data.Text hiding (map, group) import Data.Text hiding (map, group, filter, concat)
import Data.List (concat)
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core (Lang(..)) import Gargantext.Core (Lang(..))
...@@ -26,13 +27,21 @@ import Gargantext.Text.Terms.Multi.PosTagging ...@@ -26,13 +27,21 @@ import Gargantext.Text.Terms.Multi.PosTagging
import qualified Gargantext.Text.Terms.Multi.Lang.En as En import qualified Gargantext.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = concat
<$> map (map tokenTag2terms)
<$> map (filter (\t -> _my_token_pos t == Just NP))
<$> tokenTags lang txt
extractTokenTags :: Lang -> Text -> IO [[TokenTag]] tokenTag2terms :: TokenTag -> Terms
extractTokenTags lang s = map (group lang) <$> extractTokenTags' lang s tokenTag2terms (TokenTag w t _ _) = Terms w t
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = map (group lang) <$> tokenTags' lang s
extractTokenTags' :: Lang -> Text -> IO [[TokenTag]]
extractTokenTags' lang t = map tokens2tokensTags tokenTags' :: Lang -> Text -> IO [[TokenTag]]
tokenTags' lang t = map tokens2tokensTags
<$> map _sentenceTokens <$> map _sentenceTokens
<$> _sentences <$> _sentences
<$> corenlp lang t <$> corenlp lang t
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment