Unverified Commit 2d1d7606 authored by Neel Kamath's avatar Neel Kamath Committed by GitHub

Fix #24 (#25)

parent 2cb8d483
......@@ -206,6 +206,71 @@ paths:
detail: There is no sense2vec model bundled with this service.
schema:
$ref: '#/components/schemas/InvalidModel'
/sense2vec:
post:
tags: [nlp]
description: Compute phrases similar to a phrase in a sentence. sense2vec must be bundled with the service, and
the pretrained model must have the `ner` and `parser` pipeline components.
operationId: sense2vec
requestBody:
required: true
description: The phrase in the sentence
content:
application/json:
example:
sentence: Bill Gates founded Microsoft in April 4, 1975.
phrase: Bill Gates
schema:
$ref: '#/components/schemas/SentenceWithPhrase'
responses:
'200':
description: Computed phrases
content:
application/json:
example:
sense2vec:
- phrase: Mark Zuckerberg
similarity: 0.850600004196167
- phrase: Warren Buffet
similarity: 0.8501999974250793
- phrase: Warren Buffett
similarity: 0.8375999927520752
- phrase: bill gates
similarity: 0.8215000033378601
- phrase: Steve Jobs
similarity: 0.8180999755859375
- phrase: Zuckerberg
similarity: 0.8163999915122986
- phrase: Elon Musk
similarity: 0.8140000104904175
- phrase: Bill gates
similarity: 0.8119999766349792
- phrase: billionaire
similarity: 0.8116999864578247
- phrase: Elon Musk
similarity: 0.8011999726295471
schema:
$ref: '#/components/schemas/Sense2vecPhrases'
'400':
description: sense2vec is disabled, the pretrained model lacks the `ner` or `parser` pipeline components, or the phrase isn't present in the sentence.
content:
application/json:
examples:
invalid_model:
summary: The spaCy model lacks the required pipeline components.
value:
detail: The pretrained model (en_trf_bertbaseuncased_lg) doesn't support named entity recognition.
sense2vec_disabled:
summary: Similar phrases via sense2vec were requested, but a sense2vec model wasn't bundled with the
service.
value:
detail: There is no sense2vec model bundled with this service.
phrase_nonexistent:
summary: The phrase isn't present in the sentence.
value:
detail: phrase must be in sentence
schema:
$ref: '#/components/schemas/InvalidModel'
/pos:
post:
tags: [nlp]
......@@ -825,6 +890,32 @@ components:
text:
type: string
required: [text]
SentenceWithPhrase:
type: object
properties:
sentence:
type: string
description: The sentence containing the phrase.
phrase:
type: string
description: sense2vec will be run only on this phrase.
required: [sentence, phrase]
Sense2vecPhrases:
type: object
properties:
sense2vec:
type: array
description: Phrases similar to the entity
items:
type: object
properties:
phrase:
type: string
similarity:
type: number
description: Similarity in the range of 0-1
required: [phrase, similarity]
required: [sense2vec]
PartsOfSpeech:
type: object
properties:
......
......@@ -4,4 +4,4 @@ sense2vec==1.0.2
fastapi==0.45.0
uvicorn==0.10.8
pytest>=4.6.7,<5
\ No newline at end of file
pytest>=5.3.2,<6
\ No newline at end of file
"""Provides NLP via spaCy and sense2vec over an HTTP API."""
# Class methods annotated with <@pydantic.root_validator> must not be additionally annotated with <@classmethod> because
# it breaks exception handling.
import os
import typing
......@@ -21,6 +24,16 @@ if os.getenv('SENSE2VEC') == '1':
)
def enforce_components(components: typing.List[str], message: str) -> None:
    """Raises an HTTP 400 error unless the model has every one of the <components>.

    The error's detail is the module-level <pipeline_error> template formatted
    with <message>. Checking stops at the first missing component.
    """
    # <all> short-circuits, so <nlp.has_pipe> is queried in order only until
    # the first component that is absent.
    if all(nlp.has_pipe(name) for name in components):
        return
    raise fastapi.HTTPException(
        status_code=400,
        detail=pipeline_error.format(message)
    )
class NERRequest(pydantic.BaseModel):
sections: typing.List[str]
sense2vec: bool = False
......@@ -28,15 +41,11 @@ class NERRequest(pydantic.BaseModel):
@app.post('/ner')
async def recognize_named_entities(request: NERRequest):
if not nlp.has_pipe('ner') or not nlp.has_pipe('parser'):
raise fastapi.HTTPException(
status_code=400,
detail=pipeline_error.format('named entity recognition')
)
if request.sense2vec and not nlp.has_pipe('sense2vec'):
raise fastapi.HTTPException(
status_code=400,
detail='There is no sense2vec model bundled with this service.'
enforce_components(['ner', 'parser'], 'named entity recognition')
if request.sense2vec:
enforce_components(
['sense2vec'],
'There is no sense2vec model bundled with this service.'
)
response = {'data': []}
for doc in nlp.pipe(request.sections, disable=['tagger']):
......@@ -49,13 +58,31 @@ async def recognize_named_entities(request: NERRequest):
return response
def build_entity(ent, use_sense2vec):
class SimilarPhrase(pydantic.BaseModel):
    """Similar phrases computed by sense2vec."""

    # The similar phrase.
    phrase: str

    # The phrase's similarity in the range of 0-1.
    similarity: float
def compute_phrases(ent) -> typing.List[SimilarPhrase]:
    """Computes similar phrases for the entity (<ent>).

    The entity must have already been processed by the ner, parser, and
    sense2vec pipeline components.

    Returns an empty list if the entity isn't in the sense2vec vectors
    (<ent._.in_s2v> is falsy); otherwise one <SimilarPhrase> per result of
    <ent._.s2v_most_similar()>.
    """
    # Whether sense2vec was requested is the caller's decision; this function
    # takes no <use_sense2vec> flag, so only the entity's own state is checked.
    if not ent._.in_s2v:
        return []
    # Each result is indexed as ((phrase, ...), similarity); the similarity is
    # coerced to a built-in float before being stored in the model.
    return [
        SimilarPhrase(phrase=data[0][0], similarity=float(data[1]))
        for data in ent._.s2v_most_similar()
    ]
def build_entity(ent: spacy, use_sense2vec: bool):
return {
'text': ent.text,
'label': ent.label_,
......@@ -65,22 +92,42 @@ def build_entity(ent, use_sense2vec):
'start': ent.start,
'end': ent.end,
'text_with_ws': ent.text_with_ws,
'sense2vec': similar,
'sense2vec': compute_phrases(ent) if use_sense2vec else [],
}
class PhraseInSentence(pydantic.BaseModel):
    """A <phrase> in a <sentence>."""

    sentence: str
    phrase: str

    # Not additionally annotated with <@classmethod>: doing so breaks
    # exception handling for root validators.
    @pydantic.root_validator
    def check_phrase_in_sentence(cls, values):
        # Raises an HTTP 400 error if <phrase> isn't a substring of <sentence>.
        sentence = values.get('sentence')
        phrase = values.get('phrase')
        # A field that failed its own validation is missing from <values>.
        # Skip the substring check then, so that only the field's own error is
        # reported instead of an unrelated TypeError from <None not in ...>.
        if sentence is None or phrase is None:
            return values
        if phrase not in sentence:
            raise fastapi.HTTPException(
                status_code=400,
                detail='phrase must be in sentence'
            )
        return values
@app.post('/sense2vec')
async def sense2vec(request: PhraseInSentence):
    """Computes phrases similar to the request's phrase within its sentence.

    Requires the ner, parser, and sense2vec pipeline components; an HTTP 400
    error is raised by <enforce_components> otherwise. Only entities in the
    first sentence of the parsed text are considered. The response is always
    an object with a <sense2vec> list, which is empty when no recognized
    entity's text equals the phrase.
    """
    enforce_components(['ner', 'parser', 'sense2vec'], 'sense2vec')
    doc = nlp(request.sentence, disable=['tagger'])
    # Text without any sentence (e.g., an empty string) must not crash the
    # endpoint, so the first sentence is fetched without indexing.
    first_sentence = next(iter(doc.sents), None)
    if first_sentence is not None:
        for ent in first_sentence.ents:
            if ent.text == request.phrase:
                return {'sense2vec': compute_phrases(ent)}
    # The phrase is in the sentence but wasn't recognized as an entity: return
    # an empty result rather than an implicit <None> (a <null> response body).
    return {'sense2vec': []}
# Request body carrying the single <text> string that an endpoint processes.
class TextModel(pydantic.BaseModel):
    text: str
@app.post('/pos')
async def tag_parts_of_speech(request: TextModel):
if (not nlp.has_pipe('ner') or not nlp.has_pipe('parser')
or not nlp.has_pipe('tagger')):
raise fastapi.HTTPException(
status_code=400,
detail=pipeline_error.format('part-of-speech tagging')
)
enforce_components(['ner', 'parser', 'tagger'], 'part-of-speech tagging')
data = []
doc = nlp(request.text, disable=['sense2vec'])
for token in [build_token(token) for token in doc]:
......@@ -146,11 +193,7 @@ async def tokenize(request: TextModel):
@app.post('/sentencizer')
async def sentencize(request: TextModel):
    """Splits the request's text into sentences.

    Requires the parser pipeline component; an HTTP 400 error is raised by
    <enforce_components> otherwise. Returns an object whose <sentences> list
    holds the text of each sentence found.
    """
    # The single shared check replaces a duplicated inline <has_pipe> test
    # which raised the very same error.
    enforce_components(['parser'], 'sentence segmentation')
    # Only the parser is needed for segmentation, so the rest is disabled.
    doc = nlp(request.text, disable=['tagger', 'ner', 'sense2vec'])
    return {'sentences': [sent.text for sent in doc.sents]}
......
[
{
"phrase": "Mark Zuckerberg",
"similarity": 0.850600004196167
},
{
"phrase": "Warren Buffet",
"similarity": 0.8501999974250793
},
{
"phrase": "Warren Buffett",
"similarity": 0.8375999927520752
},
{
"phrase": "bill gates",
"similarity": 0.8215000033378601
},
{
"phrase": "Steve Jobs",
"similarity": 0.8180999755859375
},
{
"phrase": "Zuckerberg",
"similarity": 0.8163999915122986
},
{
"phrase": "Elon Musk",
"similarity": 0.8140000104904175
},
{
"phrase": "Bill gates",
"similarity": 0.8119999766349792
},
{
"phrase": "billionaire",
"similarity": 0.8116999864578247
},
{
"phrase": "Elon Musk",
"similarity": 0.8011999726295471
}
]
\ No newline at end of file
{
"sense2vec": [
{
"phrase": "Mark Zuckerberg",
"similarity": 0.850600004196167
},
{
"phrase": "Warren Buffet",
"similarity": 0.8501999974250793
},
{
"phrase": "Warren Buffett",
"similarity": 0.8375999927520752
},
{
"phrase": "bill gates",
"similarity": 0.8215000033378601
},
{
"phrase": "Steve Jobs",
"similarity": 0.8180999755859375
},
{
"phrase": "Zuckerberg",
"similarity": 0.8163999915122986
},
{
"phrase": "Elon Musk",
"similarity": 0.8140000104904175
},
{
"phrase": "Bill gates",
"similarity": 0.8119999766349792
},
{
"phrase": "billionaire",
"similarity": 0.8116999864578247
},
{
"phrase": "Elon Musk",
"similarity": 0.8011999726295471
}
]
}
\ No newline at end of file
import json
import fastapi
import main
import pytest
import starlette.testclient
client = starlette.testclient.TestClient(main.app)
......@@ -36,6 +38,26 @@ def test_ner_sense2vec_fail():
fail('/ner', ner_sense2vec_body, 'sense2vec')
def test_sense2vec_success():
    """The /sense2vec endpoint returns the recorded phrases for 'Bill Gates'."""
    response = client.post(
        '/sense2vec',
        json={
            'sentence': 'Bill Gates founded Microsoft in April 4, 1975.',
            'phrase': 'Bill Gates'
        }
    )
    assert response.status_code == 200
    # The expected payload is kept as a JSON fixture next to the sources.
    with open('src/outputs/sense2vec.json') as fixture:
        expected = json.load(fixture)
        assert response.json() == expected
def test_sense2vec_fail():
    """A phrase absent from the sentence is rejected with an HTTP 400 error."""
    body = {'sentence': 'My name is John Doe.', 'phrase': 'Johnny Doe'}
    response = client.post('/sense2vec', json=body)
    assert response.status_code == 400
    payload = response.json()
    assert payload['detail'] == 'phrase must be in sentence'
pos_body = {'text': 'Apple is looking at buying U.K. startup for $1 billion'}
......@@ -84,3 +106,23 @@ def fail(endpoint, body, pipe):
response = client.post(endpoint, json=body)
assert response.status_code == 400
assert 'detail' in response.json()
def test_enforce_components():
    """A component the model lacks makes <enforce_components> raise."""
    missing = 'nonexistent_component'
    with pytest.raises(fastapi.HTTPException):
        main.enforce_components([missing], missing)
def test_compute_phrases():
    """<compute_phrases> yields the recorded phrases for the 'Bill Gates' entity."""
    sentence = 'Bill Gates founded Microsoft in April 4, 1975.'
    doc = main.nlp(sentence, disable=['tagger'])
    matches = [
        ent for ent in list(doc.sents)[0].ents if ent.text == 'Bill Gates'
    ]
    # Without this guard the test would pass vacuously whenever the model
    # fails to recognize the entity, since no comparison would ever run.
    assert matches, "'Bill Gates' was not recognized as an entity"
    with open('src/outputs/compute_phrases.json') as f:
        expected = json.load(f)
    for ent in matches:
        assert main.compute_phrases(ent) == expected
def test_phrase_in_sentence():
    """Building the model with a phrase missing from the sentence raises."""
    fields = {'sentence': 'My name is John.', 'phrase': 'Johnny'}
    with pytest.raises(fastapi.HTTPException):
        main.PhraseInSentence(**fields)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment