Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Grégoire Locqueville
haskell-gargantext
Commits
1b65863d
Verified
Commit
1b65863d
authored
Apr 04, 2024
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[ngrams] brutal removal of all TermType constructors except for Multi
parent
01848abe
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
3 additions
and
54 deletions
+3
-54
Terms.hs
src/Gargantext/Core/Text/Terms.hs
+2
-53
Flow.hs
src/Gargantext/Database/Action/Flow.hs
+1
-1
No files found.
src/Gargantext/Core/Text/Terms.hs
View file @
1b65863d
...
...
@@ -47,13 +47,11 @@ import GHC.Base (String)
import
Gargantext.Core
(
Lang
,
NLPServerConfig
,
PosTagAlgo
)
import
Gargantext.Core.Text
(
sentences
,
HasText
(
..
))
import
Gargantext.Core.Text.Ngrams
(
Ngrams
(
..
),
NgramsType
(
..
),
ngramsTerms
)
import
Gargantext.Core.Text.Terms.Eleve
(
mainEleveWith
,
Tries
,
Token
,
buildTries
,
toToken
)
import
Gargantext.Core.Text.Terms.Mono
(
monoTerms
)
import
Gargantext.Core.Text.Terms.Eleve
(
Tries
,
Token
,
buildTries
,
toToken
)
import
Gargantext.Core.Text.Terms.Mono.Stem
(
stem
,
StemmingAlgorithm
(
..
))
import
Gargantext.Core.Text.Terms.Mono.Token.En
(
tokenize
)
import
Gargantext.Core.Text.Terms.Multi
(
multiterms
)
import
Gargantext.Core.Types
(
TermsCount
,
POS
,
Terms
(
Terms
),
TermsWithCount
)
import
Gargantext.Core.Utils
(
groupWithCounts
)
import
Gargantext.Database.Prelude
(
DBCmd
)
import
Gargantext.Database.Query.Table.Ngrams
(
insertNgrams
)
import
Gargantext.Database.Query.Table.NgramsPostag
(
NgramsPostag
(
..
),
insertNgramsPostag
,
np_form
,
np_lem
)
...
...
@@ -61,14 +59,7 @@ import Gargantext.Database.Schema.Ngrams (text2ngrams, NgramsId)
import
Gargantext.Prelude
data
TermType
lang
=
Mono
{
_tt_lang
::
!
lang
}
|
Multi
{
_tt_lang
::
!
lang
}
|
MonoMulti
{
_tt_lang
::
!
lang
}
|
Unsupervised
{
_tt_lang
::
!
lang
,
_tt_windowSize
::
!
Int
,
_tt_ngramsSize
::
!
Int
,
_tt_model
::
!
(
Maybe
(
Tries
Token
()
))
}
=
Multi
{
_tt_lang
::
!
lang
}
deriving
(
Generic
)
deriving
instance
(
Show
lang
)
=>
Show
(
TermType
lang
)
...
...
@@ -82,31 +73,9 @@ makeLenses ''TermType
-- | Sugar to extract terms from text (hidding 'mapM' from end user).
--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
extractTerms
::
NLPServerConfig
->
TermType
Lang
->
[
Text
]
->
IO
[[
TermsWithCount
]]
extractTerms
ncs
(
Unsupervised
{
..
})
xs
=
mapM
(
terms
ncs
(
Unsupervised
{
_tt_model
=
Just
m'
,
..
}))
xs
where
m'
=
case
_tt_model
of
Just
m''
->
m''
Nothing
->
newTries
_tt_windowSize
(
Text
.
intercalate
" "
xs
)
extractTerms
ncs
termTypeLang
xs
=
mapM
(
terms
ncs
termTypeLang
)
xs
------------------------------------------------------------------------
withLang
::
(
Foldable
t
,
Functor
t
,
HasText
h
)
=>
TermType
Lang
->
t
h
->
TermType
Lang
withLang
(
Unsupervised
{
..
})
ns
=
Unsupervised
{
_tt_model
=
m'
,
..
}
where
m'
=
case
_tt_model
of
Nothing
->
-- trace ("buildTries here" :: String)
Just
$
buildTries
_tt_ngramsSize
$
fmap
toToken
$
uniText
$
Text
.
intercalate
" . "
$
concatMap
hasText
ns
just_m
->
just_m
withLang
l
_
=
l
------------------------------------------------------------------------
data
ExtractedNgrams
=
SimpleNgrams
{
unSimpleNgrams
::
Ngrams
}
|
EnrichedNgrams
{
unEnrichedNgrams
::
NgramsPostag
}
...
...
@@ -172,12 +141,7 @@ isSimpleNgrams _ = False
-- 'MonoMulti' : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms
::
NLPServerConfig
->
TermType
Lang
->
Text
->
IO
[
TermsWithCount
]
terms
_
(
Mono
lang
)
txt
=
pure
$
monoTerms
lang
txt
terms
ncs
(
Multi
lang
)
txt
=
multiterms
ncs
lang
txt
terms
ncs
(
MonoMulti
lang
)
txt
=
terms
ncs
(
Multi
lang
)
txt
terms
_
(
Unsupervised
{
..
})
txt
=
pure
$
termsUnsupervised
(
Unsupervised
{
_tt_model
=
Just
m'
,
..
})
txt
where
m'
=
maybe
(
newTries
_tt_ngramsSize
txt
)
identity
_tt_model
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
...
...
@@ -185,21 +149,6 @@ terms _ (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _
type
WindowSize
=
Int
type
MinNgramSize
=
Int
-- | Unsupervised ngrams extraction
-- language agnostic extraction
-- TODO: newtype BlockText
termsUnsupervised
::
TermType
Lang
->
Text
->
[
TermsWithCount
]
termsUnsupervised
(
Unsupervised
{
_tt_model
=
Nothing
})
=
panicTrace
"[termsUnsupervised] no model"
termsUnsupervised
(
Unsupervised
{
_tt_model
=
Just
_tt_model
,
..
})
=
map
(
first
(
text2term
_tt_lang
))
.
groupWithCounts
-- . List.nub
.
List
.
filter
(
\
l'
->
List
.
length
l'
>=
_tt_windowSize
)
.
List
.
concat
.
mainEleveWith
_tt_model
_tt_ngramsSize
.
uniText
termsUnsupervised
_
=
undefined
newTries
::
Int
->
Text
->
Tries
Token
()
newTries
n
t
=
buildTries
n
(
toToken
<$>
uniText
t
)
...
...
src/Gargantext/Database/Action/Flow.hs
View file @
1b65863d
...
...
@@ -421,7 +421,7 @@ insertMasterDocs ncs c lang hs = do
mapNgramsDocs'
::
HashMap
.
HashMap
ExtractedNgrams
(
Map
NgramsType
(
Map
NodeId
(
Int
,
TermsCount
)))
<-
mapNodeIdNgrams
<$>
documentIdWithNgrams
(
extractNgramsT
ncs
$
withLang
lang
documentsWithId
)
(
extractNgramsT
ncs
lang
)
(
map
(
B
.
first
contextId2NodeId
)
documentsWithId
)
lId
<-
getOrMkList
masterCorpusId
masterUserId
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment