LangDetect_hs 3.27 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93


-- DEFINITIONS as SPECS 
-- (Engineering axioms for Gargantext)


------------------------------------------------------------------------
-- From file to corpus
------------------------------------------------------------------------

-- | A Corpus is a list of Documents.
--
-- Declared as a type synonym: @data Corpus = [Document]@ is not valid
-- Haskell, because a @data@ declaration needs a constructor on its
-- right-hand side.
type Corpus   = [Document]

-- | A Document should have a date, some text and maybe a language.
--
-- Remarks:
--
--   * If no date then force one ?
--   * Analyze either text or numbers
--   * only one language per document
data Document = Document { date     :: UTCTime
                         -- ^ Date of the document.
                         , uce      :: Map Text (Either (Maybe Text) (Maybe Double))
                         -- ^ Named fields, each holding either optional text
                         --   or an optional number. Parenthesized because
                         --   @$@ is not available at the type level.
                         , lang     :: Maybe Language
                         -- ^ Language of the document, when known.
                         }

-- | Specification stub: turn a list of files into a 'Corpus', given an
-- optional 'ParserType'. Not implemented yet (the body is 'undefined').
parseFiles :: Maybe ParserType -> [File] -> Corpus
parseFiles = undefined

-- | Specification stub: parse one file with a known 'ParserType'.
--
-- This function exists already (in Python); here the body is 'undefined'.
parseFile' :: ParserType -> File -> Maybe [Document]
parseFile' = undefined

-- | Parse one file, guessing its 'ParserType' when none is given.
--
-- This function does not exist yet.
--
--   * No parser type given: try 'guessParserType'; when the guess fails,
--     ask the user and return 'Nothing'.
--   * 'UnsupportedYet': ask the user about the priority and return 'Nothing'.
--   * Any other parser type: delegate to @parseFile'@.
parseFile :: Maybe ParserType -> File -> Maybe [Document]
parseFile parserType file = case parserType of
    Nothing             -> case guessParserType file of
                             Nothing      -> askThenGiveUp "Answer to the question with link to $doc"
                             Just guessed -> parseFile (Just guessed) file
    Just UnsupportedYet -> askThenGiveUp "Not supported yet, which priority ?"
    Just known          -> parseFile' known file
  where
    -- 'askUser' returns a 'ClosedAnswer', which is not a @Maybe [Document]@,
    -- so it cannot be a branch result on its own. Force the question with
    -- 'seq' and report that no documents were produced.
    -- NOTE(review): the answer itself is discarded; confirm whether it
    -- should drive a retry instead.
    askThenGiveUp :: Question -> Maybe [Document]
    askThenGiveUp question = askUser question `seq` Nothing

-- | Kinds of parser that can be selected for a file; 'UnsupportedYet'
-- stands for a kind that is not supported yet.
data ParserType
    = RIS
    | ISI
    | XML
    | CSV
    | Europresse
    | Book
    | UnsupportedYet
-- | Specification stub: guess which 'ParserType' applies to a file, or
-- 'Nothing' when no guess can be made. Not implemented yet (the body is
-- 'undefined').
guessParserType :: File -> Maybe ParserType
guessParserType = undefined


------------------------------------------------------------------------
-- What kind of interactions with our users ?
------------------------------------------------------------------------

-- | A question put to the user; it is plain 'Text' only.
type Question = Text

-- | Possible answers: closed, numeric or open.
--
-- NOTE(review): these constructors carry no payload; the type synonyms of
-- the same names live in the type namespace and are not attached to them.
-- Confirm whether each constructor was meant to wrap its value.
data Answer
    = ClosedAnswer
    | NumAnswer
    | OpenAnswer
-- Definitions of the Answers
-- | A closed answer is a 'Bool'.
type ClosedAnswer   = Bool
-- | An open answer is free 'Text'.
type OpenAnswer     = Text
-- | A numeric answer is an 'Int'.
type NumAnswer      = Int
-- | A form is a mapping from a question to maybe an answer.
-- An empty form has 'Nothing' in the (Maybe Answer) field.
-- An answered question holds 'Just' the answer.
type Formular = Map Question (Maybe Answer)

-- | Specification stub: put a 'Question' to the user and get back a
-- 'ClosedAnswer' (a 'Bool'). Not implemented yet (the body is 'undefined').
askUser :: Question -> ClosedAnswer
askUser = undefined

-- | What a user's answer is turned into: a bug report or a wish-list item.
data Advice
    = BugReport
    | WishList
-- | Ask the user a question and classify the closed answer:
-- 'True' becomes a 'BugReport', 'False' becomes a 'WishList' item.
askUser' :: Question -> Advice
askUser' question
    | askUser question = BugReport
    | otherwise        = WishList


------------------------------------------------------------------------
-- Specs for Lang Detection
------------------------------------------------------------------------
-- | Languages a document can be tagged with.
data Language
    = English
    | French

-- | Specification sketch: derive the 'Ngrams' of a document from its
-- language, falling back to @guessLang@ when the document has none.
--
-- NOTE(review): this definition is unfinished. Both @case@ expressions only
-- handle 'Nothing' (the 'Just' branches are missing), and @tag@,
-- @guessLang@ and @Ngrams@ are not defined anywhere in this file.
tagDoc :: Document -> Ngrams
tagDoc doc = ngrams
    where
        -- Start from the language recorded on the document, if any.
        ngrams = case lang doc of
                   -- No recorded language: try to guess one from the document.
                   Nothing -> case guessLang doc of
                                -- No guess either: fall back to @tag@.
                                Nothing -> tag


------------------------------------------------------------------------
-- Specs for ngrams Workflow
------------------------------------------------------------------------