Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
S
spacy-server
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
1
Issues
1
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
spacy-server
Commits
2d1d7606
Unverified
Commit
2d1d7606
authored
Dec 24, 2019
by
Neel Kamath
Committed by
GitHub
Dec 24, 2019
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fix #24 (#25)
parent
2cb8d483
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
287 additions
and
25 deletions
+287
-25
openapi.yaml
docs/openapi.yaml
+91
-0
requirements.txt
requirements.txt
+1
-1
main.py
src/main.py
+67
-24
compute_phrases.json
src/outputs/compute_phrases.json
+42
-0
sense2vec.json
src/outputs/sense2vec.json
+44
-0
test_main.py
src/test_main.py
+42
-0
No files found.
docs/openapi.yaml
View file @
2d1d7606
...
...
@@ -206,6 +206,71 @@ paths:
detail
:
There is no sense2vec model bundled with this service.
schema
:
$ref
:
'
#/components/schemas/InvalidModel'
/sense2vec
:
post
:
tags
:
[
nlp
]
description
:
Compute phrases similar to a phrase in a sentence. sense2vec must be bundled with the service, and
the pretrained model must have the `ner` and `parser` pipeline components.
operationId
:
sense2vec
requestBody
:
required
:
true
description
:
The phrase in the sentence
content
:
application/json
:
example
:
sentence
:
Bill Gates founded Microsoft in April 4, 1975.
phrase
:
Bill Gates
schema
:
$ref
:
'
#/components/schemas/SentenceWithPhrase'
responses
:
'
200'
:
description
:
Computed phrases
content
:
application/json
:
example
:
sense2vec
:
-
phrase
:
Mark Zuckerberg
similarity
:
0.850600004196167
-
phrase
:
Warren Buffet
similarity
:
0.8501999974250793
-
phrase
:
Warren Buffett
similarity
:
0.8375999927520752
-
phrase
:
bill gates
similarity
:
0.8215000033378601
-
phrase
:
Steve Jobs
similarity
:
0.8180999755859375
-
phrase
:
Zuckerberg
similarity
:
0.8163999915122986
-
phrase
:
Elon Musk
similarity
:
0.8140000104904175
-
phrase
:
Bill gates
similarity
:
0.8119999766349792
-
phrase
:
billionaire
similarity
:
0.8116999864578247
-
phrase
:
Elon Musk
similarity
:
0.8011999726295471
schema
:
$ref
:
'
#/components/schemas/Sense2vecPhrases'
'
400'
:
description
:
sense2vec is disabled, or the pretrained model lacks the `ner` or `parser` pipeline components.
content
:
application/json
:
examples
:
invalid_model
:
summary
:
The spaCy model lacks the required pipeline components.
value
:
detail
:
The pretrained model (en_trf_bertbaseuncased_lg) doesn't support named entity recognition.
sense2vec_disabled
:
summary
:
Similar phrases via sense2vec were requested, but a sense2vec model wasn't bundled with the
service.
value
:
detail
:
There is no sense2vec model bundled with this service.
phrase_nonexistent
:
summary
:
The phrase isn't present in the sentence.
value
:
detail
:
phrase must be in sentence
schema
:
$ref
:
'
#/components/schemas/InvalidModel'
/pos
:
post
:
tags
:
[
nlp
]
...
...
@@ -825,6 +890,32 @@ components:
text
:
type
:
string
required
:
[
text
]
SentenceWithPhrase
:
type
:
object
properties
:
sentence
:
type
:
string
description
:
The sentence containing the phrase.
phrase
:
type
:
string
description
:
sense2vec will be run only on this phrase.
required
:
[
sentence
,
phrase
]
Sense2vecPhrases
:
type
:
object
properties
:
sense2vec
:
type
:
array
description
:
Phrases similar to the entity
items
:
type
:
object
properties
:
phrase
:
type
:
string
similarity
:
type
:
number
description
:
Similarity in the range of 0-1
required
:
[
phrase
,
similarity
]
required
:
[
sense2vec
]
PartsOfSpeech
:
type
:
object
properties
:
...
...
requirements.txt
View file @
2d1d7606
...
...
@@ -4,4 +4,4 @@ sense2vec==1.0.2
fastapi
==0.45.0
uvicorn
==0.10.8
pytest
>=4.6.7,<5
\ No newline at end of file
pytest
>=5.3.2,<6
\ No newline at end of file
src/main.py
View file @
2d1d7606
"""Provides NLP via spaCy and sense2vec over an HTTP API."""
# Class methods annotated with <@pydantic.root_validator> must not be additionally annotated with <@classmethod> because
# it breaks exception handling.
import
os
import
typing
...
...
@@ -21,6 +24,16 @@ if os.getenv('SENSE2VEC') == '1':
)
def enforce_components(components: typing.List[str], message: str) -> None:
    """Raises an HTTP 400 error unless the model has every pipe in <components>.

    The error detail is <pipeline_error> formatted with <message>. Components
    are checked in order, and checking stops at the first one that is missing.
    """
    if all(nlp.has_pipe(name) for name in components):
        return
    raise fastapi.HTTPException(
        status_code=400,
        detail=pipeline_error.format(message)
    )
class
NERRequest
(
pydantic
.
BaseModel
):
sections
:
typing
.
List
[
str
]
sense2vec
:
bool
=
False
...
...
@@ -28,15 +41,11 @@ class NERRequest(pydantic.BaseModel):
@
app
.
post
(
'/ner'
)
async
def
recognize_named_entities
(
request
:
NERRequest
):
if
not
nlp
.
has_pipe
(
'ner'
)
or
not
nlp
.
has_pipe
(
'parser'
):
raise
fastapi
.
HTTPException
(
status_code
=
400
,
detail
=
pipeline_error
.
format
(
'named entity recognition'
)
)
if
request
.
sense2vec
and
not
nlp
.
has_pipe
(
'sense2vec'
):
raise
fastapi
.
HTTPException
(
status_code
=
400
,
detail
=
'There is no sense2vec model bundled with this service.'
enforce_components
([
'ner'
,
'parser'
],
'named entity recognition'
)
if
request
.
sense2vec
:
enforce_components
(
[
'sense2vec'
],
'There is no sense2vec model bundled with this service.'
)
response
=
{
'data'
:
[]}
for
doc
in
nlp
.
pipe
(
request
.
sections
,
disable
=
[
'tagger'
]):
...
...
@@ -49,13 +58,31 @@ async def recognize_named_entities(request: NERRequest):
return
response
def
build_entity
(
ent
,
use_sense2vec
):
class SimilarPhrase(pydantic.BaseModel):
    """Similar phrases computed by sense2vec."""

    # The similar phrase.
    phrase: str
    # The phrase's similarity in the range of 0-1.
    similarity: float
def
compute_phrases
(
ent
)
->
typing
.
List
[
SimilarPhrase
]:
"""Computes similar phrases for the entity (<ent>).
The entity must have already been processed by the ner, parser, and
sense2vec pipeline components.
"""
similar
=
[]
if
use_sense2vec
and
ent
.
_
.
in_s2v
:
if
ent
.
_
.
in_s2v
:
for
data
in
ent
.
_
.
s2v_most_similar
():
similar
.
append
(
{
'phrase'
:
data
[
0
][
0
],
'similarity'
:
float
(
data
[
1
])}
SimilarPhrase
(
phrase
=
data
[
0
][
0
],
similarity
=
float
(
data
[
1
]))
)
return
similar
def
build_entity
(
ent
:
spacy
,
use_sense2vec
:
bool
):
return
{
'text'
:
ent
.
text
,
'label'
:
ent
.
label_
,
...
...
@@ -65,22 +92,42 @@ def build_entity(ent, use_sense2vec):
'start'
:
ent
.
start
,
'end'
:
ent
.
end
,
'text_with_ws'
:
ent
.
text_with_ws
,
'sense2vec'
:
similar
,
'sense2vec'
:
compute_phrases
(
ent
)
if
use_sense2vec
else
[]
,
}
class PhraseInSentence(pydantic.BaseModel):
    """A <phrase> in a <sentence>."""

    sentence: str
    phrase: str

    @pydantic.root_validator
    def check_phrase_in_sentence(cls, values):
        """Rejects the model with an HTTP 400 error unless <phrase> occurs in <sentence>.

        Returns <values> unchanged when the check passes.
        """
        phrase = values.get('phrase')
        sentence = values.get('sentence')
        # A field that failed its own validation may be absent from <values>. Leave that failure to pydantic instead
        # of evaluating <in> against <None>, which would raise an unrelated <TypeError>.
        if phrase is None or sentence is None:
            return values
        if phrase not in sentence:
            raise fastapi.HTTPException(status_code=400, detail='phrase must be in sentence')
        return values
# Computes phrases similar to <request.phrase>, which must be an entity recognized in the first sentence of
# <request.sentence>. Requires the ner, parser, and sense2vec pipeline components; otherwise an HTTP 400 error is
# raised by <enforce_components>.
@app.post('/sense2vec')
async def sense2vec(request: PhraseInSentence):
    enforce_components(['ner', 'parser', 'sense2vec'], 'sense2vec')
    doc = nlp(request.sentence, disable=['tagger'])
    # Only the first sentence is searched. <next> with a default avoids an <IndexError> if no sentence is yielded.
    first_sentence = next(iter(doc.sents), None)
    if first_sentence is not None:
        for ent in first_sentence.ents:
            if ent.text == request.phrase:
                return {'sense2vec': compute_phrases(ent)}
    # The phrase is in the sentence but wasn't recognized as an entity. Return an empty list so that the response
    # always has the <sense2vec> key instead of being <null>.
    return {'sense2vec': []}
# Request body holding the single <text> string which an endpoint passes to the model.
class TextModel(pydantic.BaseModel):
    text: str
@
app
.
post
(
'/pos'
)
async
def
tag_parts_of_speech
(
request
:
TextModel
):
if
(
not
nlp
.
has_pipe
(
'ner'
)
or
not
nlp
.
has_pipe
(
'parser'
)
or
not
nlp
.
has_pipe
(
'tagger'
)):
raise
fastapi
.
HTTPException
(
status_code
=
400
,
detail
=
pipeline_error
.
format
(
'part-of-speech tagging'
)
)
enforce_components
([
'ner'
,
'parser'
,
'tagger'
],
'part-of-speech tagging'
)
data
=
[]
doc
=
nlp
(
request
.
text
,
disable
=
[
'sense2vec'
])
for
token
in
[
build_token
(
token
)
for
token
in
doc
]:
...
...
@@ -146,11 +193,7 @@ async def tokenize(request: TextModel):
@
app
.
post
(
'/sentencizer'
)
async
def
sentencize
(
request
:
TextModel
):
if
not
nlp
.
has_pipe
(
'parser'
):
raise
fastapi
.
HTTPException
(
status_code
=
400
,
detail
=
pipeline_error
.
format
(
'sentence segmentation'
)
)
enforce_components
([
'parser'
],
'sentence segmentation'
)
doc
=
nlp
(
request
.
text
,
disable
=
[
'tagger'
,
'ner'
,
'sense2vec'
])
return
{
'sentences'
:
[
sent
.
text
for
sent
in
doc
.
sents
]}
...
...
src/outputs/compute_phrases.json
0 → 100644
View file @
2d1d7606
[
{
"phrase"
:
"Mark Zuckerberg"
,
"similarity"
:
0.850600004196167
},
{
"phrase"
:
"Warren Buffet"
,
"similarity"
:
0.8501999974250793
},
{
"phrase"
:
"Warren Buffett"
,
"similarity"
:
0.8375999927520752
},
{
"phrase"
:
"bill gates"
,
"similarity"
:
0.8215000033378601
},
{
"phrase"
:
"Steve Jobs"
,
"similarity"
:
0.8180999755859375
},
{
"phrase"
:
"Zuckerberg"
,
"similarity"
:
0.8163999915122986
},
{
"phrase"
:
"Elon Musk"
,
"similarity"
:
0.8140000104904175
},
{
"phrase"
:
"Bill gates"
,
"similarity"
:
0.8119999766349792
},
{
"phrase"
:
"billionaire"
,
"similarity"
:
0.8116999864578247
},
{
"phrase"
:
"Elon Musk"
,
"similarity"
:
0.8011999726295471
}
]
\ No newline at end of file
src/outputs/sense2vec.json
0 → 100644
View file @
2d1d7606
{
"sense2vec"
:
[
{
"phrase"
:
"Mark Zuckerberg"
,
"similarity"
:
0.850600004196167
},
{
"phrase"
:
"Warren Buffet"
,
"similarity"
:
0.8501999974250793
},
{
"phrase"
:
"Warren Buffett"
,
"similarity"
:
0.8375999927520752
},
{
"phrase"
:
"bill gates"
,
"similarity"
:
0.8215000033378601
},
{
"phrase"
:
"Steve Jobs"
,
"similarity"
:
0.8180999755859375
},
{
"phrase"
:
"Zuckerberg"
,
"similarity"
:
0.8163999915122986
},
{
"phrase"
:
"Elon Musk"
,
"similarity"
:
0.8140000104904175
},
{
"phrase"
:
"Bill gates"
,
"similarity"
:
0.8119999766349792
},
{
"phrase"
:
"billionaire"
,
"similarity"
:
0.8116999864578247
},
{
"phrase"
:
"Elon Musk"
,
"similarity"
:
0.8011999726295471
}
]
}
\ No newline at end of file
src/test_main.py
View file @
2d1d7606
import
json
import
fastapi
import
main
import
pytest
import
starlette.testclient
client
=
starlette
.
testclient
.
TestClient
(
main
.
app
)
...
...
@@ -36,6 +38,26 @@ def test_ner_sense2vec_fail():
fail
(
'/ner'
,
ner_sense2vec_body
,
'sense2vec'
)
def test_sense2vec_success():
    """A phrase recognized in the sentence yields the stored sense2vec output."""
    payload = {
        'sentence': 'Bill Gates founded Microsoft in April 4, 1975.',
        'phrase': 'Bill Gates',
    }
    response = client.post('/sense2vec', json=payload)
    assert response.status_code == 200
    with open('src/outputs/sense2vec.json') as f:
        expected = json.load(f)
    assert response.json() == expected
def test_sense2vec_fail():
    """A phrase absent from the sentence is rejected with an HTTP 400 error."""
    payload = {'sentence': 'My name is John Doe.', 'phrase': 'Johnny Doe'}
    response = client.post('/sense2vec', json=payload)
    assert response.status_code == 400
    assert response.json()['detail'] == 'phrase must be in sentence'
pos_body
=
{
'text'
:
'Apple is looking at buying U.K. startup for $1 billion'
}
...
...
@@ -84,3 +106,23 @@ def fail(endpoint, body, pipe):
response
=
client
.
post
(
endpoint
,
json
=
body
)
assert
response
.
status_code
==
400
assert
'detail'
in
response
.
json
()
def test_enforce_components():
    """A component missing from the pipeline makes <enforce_components> raise."""
    missing = 'nonexistent_component'
    with pytest.raises(fastapi.HTTPException):
        main.enforce_components([missing], missing)
def test_compute_phrases():
    """<compute_phrases> reproduces the stored output for the "Bill Gates" entity."""
    sentence = 'Bill Gates founded Microsoft in April 4, 1975.'
    doc = main.nlp(sentence, disable=['tagger'])
    matches = [ent for ent in list(doc.sents)[0].ents if ent.text == 'Bill Gates']
    # Without this check the test would pass vacuously whenever the model failed to recognize the entity.
    assert matches, 'the model did not recognize "Bill Gates" as an entity'
    with open('src/outputs/compute_phrases.json') as f:
        expected = json.load(f)
    for ent in matches:
        assert main.compute_phrases(ent) == expected
def test_phrase_in_sentence():
    """Building the model with a phrase that isn't in the sentence raises."""
    fields = {'sentence': 'My name is John.', 'phrase': 'Johnny'}
    with pytest.raises(fastapi.HTTPException):
        main.PhraseInSentence(**fields)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment