1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from node.models import Node, NodeType, User, Language, ResourceType
from parsing.Caches import Caches
# Fetch the account that will own the sample corpus, creating it on first run.
# get_or_create() inserts only when the lookup finds no row; any other failure
# (for instance a database error) now propagates instead of being mistaken
# for "user missing" by a bare ``except:``.
me = User.objects.get_or_create(username='Mat')[0]
# Resource type used to tag the PubMed archive added to the corpus below.
# The lookup goes through the model manager (``.objects``), like the other
# lookups in this script.  filter().first() returns None when no row exists
# and tolerates several rows sharing the name, so the type is inserted only
# when it is really absent.
typePubmed = ResourceType.objects.filter(name='pubmed').first()
if typePubmed is None:
    typePubmed = ResourceType(name='pubmed')
    typePubmed.save()
# Node types used by this script: 'corpus' is assigned to the corpus node
# created below; 'document' is only ensured to exist here.
# Each type is resolved on its own through the model manager, so a missing
# 'document' row no longer causes a second 'corpus' row to be inserted.
# filter().first() yields None when nothing matches and tolerates duplicates.
typeCorpus = NodeType.objects.filter(name='corpus').first()
if typeCorpus is None:
    typeCorpus = NodeType(name='corpus')
    typeCorpus.save()

typeDoc = NodeType.objects.filter(name='document').first()
if typeDoc is None:
    typeDoc = NodeType(name='document')
    typeDoc.save()
# English language row; .get() raises if no row with iso2='en' exists.
# NOTE(review): `english` is not used in the visible part of the script --
# confirm whether later code depends on it.
english = Language.objects.get(iso2='en')

# WARNING: this deletes every Node in the database, not only the ones this
# script created.
Node.objects.all().delete()

# Fetch the demo corpus, or create it owned by `me` with the 'corpus' type.
# Only the "no such row" case triggers creation; any other error propagates
# instead of being swallowed by a bare ``except:``.
try:
    corpus = Node.objects.get(name='My first corpus')
except Node.DoesNotExist:
    corpus = Node(name='My first corpus', type=typeCorpus, user=me)
    corpus.save()
# Step 1: delete whatever children are currently attached to the corpus.
print('Remove previously existing children of the corpus...')
stale_children = corpus.children.all()
stale_children.delete()

# Step 2: attach the sample PubMed archive, tagged with the 'pubmed' type.
print('Adding a resource to the corpus...')
pubmed_archive = './data_samples/pubmed.zip'
corpus.add_resource(file=pubmed_archive, type=typePubmed)

# Step 3: parse the attached resources.
# NOTE(review): presumably this creates the child document nodes -- confirm
# against Node.parse_resources.
print('Adding the corpus resources...')
corpus.parse_resources()

# Step 4: run ngram extraction over the corpus children, using the 'title'
# and 'abstract' fields.
print('Extracting ngrams from the documents...')
ngram_fields = ['title', 'abstract']
documents = corpus.children.all()
documents.extract_ngrams(ngram_fields)