Commit 7665bcb6 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TERMS] main function.

parent c0914f9a
......@@ -47,6 +47,7 @@ import Protolude ( Bool(True, False), Int, Double, Integer
, elem, die, mod, div, const, either
, curry, uncurry
, otherwise, when
, undefined
)
-- TODO import functions optimized in Utils.Count
......
......@@ -35,16 +35,6 @@ data Group = Group { _group_label :: Terms
} deriving (Show)
clean :: Text -> Text
clean txt = DT.map clean' txt
where
clean' '’' = '\''
clean' c = c
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
-------------------------------------------------------------------
-- Contexts of text
sentences :: Text -> [Text]
......@@ -84,4 +74,3 @@ testText_fr = DT.pack "La fouille de textes ou « l'extraction de connaissances
-- group ngrams
--ocs = occ $ ws
......@@ -12,84 +12,14 @@ Mainly reexport functions in @Data.Text.Metrics@
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Metrics where
module Gargantext.Text.Metrics (levenshtein
, levenshteinNorm
, damerauLevenshtein
, damerauLevenshteinNorm
, overlap
, jaccard
, hamming
) where
import Data.Text (Text)
import GHC.Real (Ratio)
import qualified Data.Text.Metrics as DTM
import Gargantext.Prelude
{- * Example of a title
-}
-- | This module provides metrics to compare Text values,
-- starting as an API reexporting the main functions of the great
-- text-metrics library by Mark Karpov
-- | Levenshtein Distance
-- In information theory, linguistics, and computer science,
-- the Levenshtein distance is a string metric for measuring
-- the difference between two sequences.
-- See: https://en.wikipedia.org/wiki/Levenshtein_distance
--import Data.Text (Text)
--import GHC.Real (Ratio)
--import qualified Data.Text.Metrics as DTM
--
levenshtein :: Text -> Text -> Int
levenshtein = DTM.levenshtein
-- | Return normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
--import Gargantext.Prelude
--
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm = DTM.levenshteinNorm
-- | Return Damerau-Levenshtein distance between two 'Text' values. The
-- function works like 'levenshtein', but the collection of allowed
-- operations also includes transposition of two /adjacent/ characters.
-- See also:
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
--
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein = DTM.damerauLevenshtein
-- | Normalized Damerau-Levenshtein distance between two 'Text' values.
--
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
-- Treating inputs like sets
-- | Return overlap coefficient for two 'Text' values. Returned value
-- is in the range from 0 (no similarity) to 1 (exact match). Return 1
-- if both 'Text' values are empty.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
overlap :: Text -> Text -> Ratio Int
overlap = DTM.overlap
-- | Jaccard distance
-- measures dissimilarity between sample sets
jaccard :: Text -> Text -> Ratio Int
jaccard = DTM.jaccard
-- | Hamming Distance
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols are different. In other words, it measures the minimum number of
-- substitutions required to change one string into the other.
-- See: https://en.wikipedia.org/wiki/Hamming_distance
hamming :: Text -> Text -> Maybe Int
hamming = DTM.hamming
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
{-|
Module : Gargantext.Text.Metrics.CharByChar
Description : Character-level metrics to compare Text values.
Copyright : (c) CNRS, 2017 - present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Mainly reexports functions from @Data.Text.Metrics@
-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Metrics.CharByChar (levenshtein
, levenshteinNorm
, damerauLevenshtein
, damerauLevenshteinNorm
, overlap
, jaccard
, hamming
) where
import Data.Text (Text)
import GHC.Real (Ratio)
import qualified Data.Text.Metrics as DTM
import Gargantext.Prelude
--noApax :: Ord a => Map a Occ -> Map a Occ
--noApax m = M.filter (>1) m
{- * Example of a title
-}
-- | This module provides metrics to compare Text values,
-- starting as an API reexporting the main functions of the great
-- text-metrics library by Mark Karpov
-- | Levenshtein Distance
-- In information theory, linguistics, and computer science,
-- the Levenshtein distance is a string metric for measuring
-- the difference between two sequences.
-- See: https://en.wikipedia.org/wiki/Levenshtein_distance
--
levenshtein :: Text -> Text -> Int
levenshtein = DTM.levenshtein
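--
-- Illustration (a sketch added here, not upstream documentation; assumes
-- OverloadedStrings for the literals): turning "kitten" into "sitting"
-- takes three single-character edits.
--
-- >>> levenshtein "kitten" "sitting"
-- 3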
-- | Return normalized Levenshtein distance between two 'Text' values.
-- Result is a non-negative rational number (represented as @'Ratio'
-- 'Data.Numeric.Natural'@), where 0 signifies no similarity between the
-- strings, while 1 means exact match.
--
levenshteinNorm :: Text -> Text -> Ratio Int
levenshteinNorm = DTM.levenshteinNorm
-- | Return Damerau-Levenshtein distance between two 'Text' values. The
-- function works like 'levenshtein', but the collection of allowed
-- operations also includes transposition of two /adjacent/ characters.
-- See also:
-- <https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance>
--
damerauLevenshtein :: Text -> Text -> Int
damerauLevenshtein = DTM.damerauLevenshtein
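--
-- Illustration (sketch, assumes OverloadedStrings): swapping two adjacent
-- characters counts as a single operation here, but as two edits for plain
-- 'levenshtein'.
--
-- >>> damerauLevenshtein "ca" "ac"
-- 1
-- >>> levenshtein "ca" "ac"
-- 2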
-- | Normalized Damerau-Levenshtein distance between two 'Text' values.
--
damerauLevenshteinNorm :: Text -> Text -> Ratio Int
damerauLevenshteinNorm = DTM.damerauLevenshteinNorm
-- Treating inputs like sets
-- | Return overlap coefficient for two 'Text' values. Returned value
-- is in the range from 0 (no similarity) to 1 (exact match). Return 1
-- if both 'Text' values are empty.
--
-- See also: <https://en.wikipedia.org/wiki/Overlap_coefficient>.
overlap :: Text -> Text -> Ratio Int
overlap = DTM.overlap
-- | Jaccard distance
-- measures dissimilarity between sample sets
jaccard :: Text -> Text -> Ratio Int
jaccard = DTM.jaccard
-- | Hamming Distance
-- In information theory, the Hamming distance between two strings of
-- equal length is the number of positions at which the corresponding
-- symbols are different. In other words, it measures the minimum number of
-- substitutions required to change one string into the other.
-- See: https://en.wikipedia.org/wiki/Hamming_distance
hamming :: Text -> Text -> Maybe Int
hamming = DTM.hamming
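--
-- Illustration (sketch, assumes OverloadedStrings): "karolin" and "kathrin"
-- differ at three positions, and texts of unequal length have no Hamming
-- distance, hence the 'Maybe'.
--
-- >>> hamming "karolin" "kathrin"
-- Just 3
-- >>> hamming "abc" "abcd"
-- Nothing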
......@@ -37,6 +37,7 @@ import Data.Either.Extra(Either())
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import qualified Data.Text as DT
----
--import Control.Monad (join)
import Codec.Archive.Zip (withArchive, getEntry, getEntries)
......@@ -108,4 +109,10 @@ openZip fp = do
bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
pure bs
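-- | Replace the typographic apostrophe (’) with a plain ASCII quote, so
-- downstream code only ever sees one apostrophe form.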
clean :: Text -> Text
clean txt = DT.map clean' txt
where
clean' '’' = '\''
clean' c = c
......@@ -19,8 +19,6 @@ Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
Source: https://en.wikipedia.org/wiki/Ngrams
TODO
-- Prelude.concat <$> Prelude.map (filter (\n -> _my_token_pos n == Just NP)) <$> extractNgrams Gargantext.Core.EN testText_en
group Ngrams -> Tree
compute occ by node of Tree
group occs according to groups
......@@ -35,10 +33,27 @@ compute graph
module Gargantext.Text.Terms
where
import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoterms')
data TermType = Mono | Multi
------------------------------------------------------------------------
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms (TokenTag w t _ _) = Terms w t
terms :: TermType -> Maybe Lang -> Text -> IO [Terms]
terms Mono (Just lang) txt = pure $ monoterms' lang txt
terms Multi (Just lang) txt = multiterms lang txt
terms _ Nothing _ = panic "Lang needed"
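-- Usage sketch (added illustration, not part of the original code; assumes
-- OverloadedStrings and that 'EN' is one of the 'Lang' constructors):
--
-- >>> terms Mono  (Just EN) "hello world"  -- pure, word-level terms
-- >>> terms Multi (Just EN) termTests      -- CoreNLP-backed multi-terms (IO)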
------------------------------------------------------------------------
termTests :: Text
termTests = "It is hard to detect important articles in a specific context. Information retrieval techniques based on full text search can be inaccurate to identify main topics and they are not able to provide an indication about the importance of the article. Generating a citation network is a good way to find most popular articles but this approach is not context aware. The text around a citation mark is generally a good summary of the referred article. So citation context analysis presents an opportunity to use the wisdom of crowd for detecting important articles in a context sensitive way. In this work, we analyze citation contexts to rank articles properly for a given topic. The model proposed uses citation contexts in order to create a directed and edge-labeled citation network based on the target topic. Then we apply common ranking algorithms in order to find important articles in this newly created network. We showed that this method successfully detects a good subset of most prominent articles in a given topic. The biggest contribution of this approach is that we are able to identify important articles for a given search term even though these articles do not contain this search term. This technique can be used in other linked documents including web pages, legal documents, and patents as well as scientific papers."
......@@ -13,7 +13,7 @@ Mono-terms are Nterms where n == 1.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Mono
module Gargantext.Text.Terms.Mono (monoterms, monoterms')
where
import Data.Text (Text, toLower, split, splitOn, pack)
......@@ -26,12 +26,14 @@ import Gargantext.Text.Terms.Mono.Stem (stem)
import Gargantext.Prelude
import Data.Char (isAlphaNum, isSpace)
monoterms :: Text -> [Term]
monoterms' :: Lang -> Text -> [Terms]
monoterms' l txt = map (text2terms l) $ monoterms txt
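-- | Split a text on word separators and lowercase the chunks. A small
-- sketch (assuming OverloadedStrings; adjacent separators yield empty
-- chunks):
--
-- >>> monoterms "Hello world"
-- ["hello","world"]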
monoterms :: Text -> [Text]
monoterms txt = map toLower $ split isWord txt
where
isWord c = c `elem` [' ', '\'', ',', ';']
text2terms :: Lang -> Text -> Terms
text2terms lang txt = Terms label stems
where
......
......@@ -23,11 +23,20 @@ module Gargantext.Text.Terms.Mono.Token (tokenize)
import Data.Text (Text)
import qualified Gargantext.Text.Terms.Mono.Token.En as En
import Gargantext.Core (Lang(..))
import Gargantext.Prelude
type Token = Text
-- >>> tokenize "A rose is a rose is a rose."
-- ["A","rose","is","a","rose","is","a","rose", "."]
--
data Context = Letter | Word | Sentence | Line | Paragraph
tokenize :: Text -> [Token]
tokenize = En.tokenize
tokenize' :: Lang -> Context -> [Token]
tokenize' = undefined
......@@ -13,10 +13,11 @@ Multi-terms are ngrams where n > 1.
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Multi (extractTokenTags)
module Gargantext.Text.Terms.Multi (multiterms)
where
import Data.Text hiding (map, group)
import Data.Text hiding (map, group, filter, concat)
import Data.List (concat)
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
......@@ -26,13 +27,21 @@ import Gargantext.Text.Terms.Multi.PosTagging
import qualified Gargantext.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
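-- | Multi-term extraction, sketched: 'tokenTags' runs the CoreNLP pipeline,
-- each sentence's tags are filtered down to noun phrases (POS tag 'NP'),
-- and the remaining 'TokenTag's are flattened into 'Terms'.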
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = concat
<$> map (map tokenTag2terms)
<$> map (filter (\t -> _my_token_pos t == Just NP))
<$> tokenTags lang txt
extractTokenTags :: Lang -> Text -> IO [[TokenTag]]
extractTokenTags lang s = map (group lang) <$> extractTokenTags' lang s
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms (TokenTag w t _ _) = Terms w t
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = map (group lang) <$> tokenTags' lang s
extractTokenTags' :: Lang -> Text -> IO [[TokenTag]]
extractTokenTags' lang t = map tokens2tokensTags
tokenTags' :: Lang -> Text -> IO [[TokenTag]]
tokenTags' lang t = map tokens2tokensTags
<$> map _sentenceTokens
<$> _sentences
<$> corenlp lang t
......