Fix #24 (#25)

2d1d7606 · Neel Kamath · GitHub · 2cb8d483 · 2d1d7606 · 2d1d7606
Unverified Commit 2d1d7606 authored Dec 24, 2019 by Neel Kamath Committed by GitHub Dec 24, 2019
6 changed files
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -206,6 +206,71 @@ paths:
                    detail: There is no sense2vec model bundled with this service.
              schema:
                $ref: '#/components/schemas/InvalidModel'
+  /sense2vec:
+    post:
+      tags: [nlp]
+      description: Compute phrases similar to a phrase in a sentence. sense2vec must be bundled with the service, and
+        the pretrained model must have the `ner` and `parser` pipeline components.
+      operationId: sense2vec
+      requestBody:
+        required: true
+        description: The phrase in the sentence
+        content:
+          application/json:
+            example:
+              sentence: Bill Gates founded Microsoft in April 4, 1975.
+              phrase: Bill Gates
+            schema:
+              $ref: '#/components/schemas/SentenceWithPhrase'
+      responses:
+        '200':
+          description: Computed phrases
+          content:
+            application/json:
+              example:
+                sense2vec:
+                  - phrase: Mark Zuckerberg
+                    similarity: 0.850600004196167
+                  - phrase: Warren Buffet
+                    similarity: 0.8501999974250793
+                  - phrase: Warren Buffett
+                    similarity: 0.8375999927520752
+                  - phrase: bill gates
+                    similarity: 0.8215000033378601
+                  - phrase: Steve Jobs
+                    similarity: 0.8180999755859375
+                  - phrase: Zuckerberg
+                    similarity: 0.8163999915122986
+                  - phrase: Elon Musk
+                    similarity: 0.8140000104904175
+                  - phrase: Bill gates
+                    similarity: 0.8119999766349792
+                  - phrase: billionaire
+                    similarity: 0.8116999864578247
+                  - phrase: Elon Musk
+                    similarity: 0.8011999726295471
+              schema:
+                $ref: '#/components/schemas/Sense2vecPhrases'
+        '400':
+          description: sense2vec is disabled, the pretrained model lacks the `ner` or `parser` pipeline components, or the phrase isn't in the sentence.
+          content:
+            application/json:
+              examples:
+                invalid_model:
+                  summary: The spaCy model lacks the required pipeline components.
+                  value:
+                    detail: The pretrained model (en_trf_bertbaseuncased_lg) doesn't support named entity recognition.
+                sense2vec_disabled:
+                  summary: Similar phrases via sense2vec were requested, but a sense2vec model wasn't bundled with the
+                    service.
+                  value:
+                    detail: There is no sense2vec model bundled with this service.
+                phrase_nonexistent:
+                  summary: The phrase isn't present in the sentence.
+                  value:
+                    detail: phrase must be in sentence
+              schema:
+                $ref: '#/components/schemas/InvalidModel'
  /pos:
    post:
      tags: [nlp]
@@ -825,6 +890,32 @@ components:
        text:
          type: string
      required: [text]
+    SentenceWithPhrase:
+      type: object
+      properties:
+        sentence:
+          type: string
+          description: The sentence containing the phrase.
+        phrase:
+          type: string
+          description: sense2vec will be run only on this phrase.
+      required: [sentence, phrase]
+    Sense2vecPhrases:
+      type: object
+      properties:
+        sense2vec:
+          type: array
+          description: Phrases similar to the entity
+          items:
+            type: object
+            properties:
+              phrase:
+                type: string
+              similarity:
+                type: number
+                description: Similarity in the range of 0-1
+            required: [phrase, similarity]
+      required: [sense2vec]
    PartsOfSpeech:
      type: object
      properties:

--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,4 @@ sense2vec==1.0.2

 fastapi==0.45.0
 uvicorn==0.10.8
-pytest>=4.6.7,<5
\ No newline at end of file
+pytest>=5.3.2,<6
\ No newline at end of file
--- a/src/main.py
+++ b/src/main.py
 """Provides NLP via spaCy and sense2vec over an HTTP API."""

+# Class methods decorated with <@pydantic.root_validator> must not be additionally decorated with <@classmethod> because
+# doing so breaks exception handling.
+
 import os
 import typing

@@ -21,6 +24,16 @@ if os.getenv('SENSE2VEC') == '1':
    )


+def enforce_components(components: typing.List[str], message: str) -> None:
+    """Raises an HTTP 400 for <message> if any of <components> is missing."""
+    for component in components:
+        if not nlp.has_pipe(component):
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail=pipeline_error.format(message)
+            )
+
+
 class NERRequest(pydantic.BaseModel):
    sections: typing.List[str]
    sense2vec: bool = False
@@ -28,15 +41,11 @@ class NERRequest(pydantic.BaseModel):

 @app.post('/ner')
 async def recognize_named_entities(request: NERRequest):
-    if not nlp.has_pipe('ner') or not nlp.has_pipe('parser'):
-        raise fastapi.HTTPException(
-            status_code=400,
-            detail=pipeline_error.format('named entity recognition')
-        )
-    if request.sense2vec and not nlp.has_pipe('sense2vec'):
-        raise fastapi.HTTPException(
-            status_code=400,
-            detail='There is no sense2vec model bundled with this service.'
+    enforce_components(['ner', 'parser'], 'named entity recognition')
+    if request.sense2vec:
+        enforce_components(
+            ['sense2vec'],
+            'There is no sense2vec model bundled with this service.'
        )
    response = {'data': []}
    for doc in nlp.pipe(request.sections, disable=['tagger']):
@@ -49,13 +58,31 @@ async def recognize_named_entities(request: NERRequest):
    return response


-def build_entity(ent, use_sense2vec):
+class SimilarPhrase(pydantic.BaseModel):
+    """A similar phrase computed by sense2vec."""
+
+    """The similar phrase."""
+    phrase: str
+    """The phrase's similarity in the range of 0-1."""
+    similarity: float
+
+
+def compute_phrases(ent) -> typing.List[SimilarPhrase]:
+    """Computes similar phrases for the entity (<ent>).
+
+    The entity must have already been processed by the ner, parser, and
+    sense2vec pipeline components.
+    """
    similar = []
-    if use_sense2vec and ent._.in_s2v:
+    if ent._.in_s2v:
        for data in ent._.s2v_most_similar():
            similar.append(
-                {'phrase': data[0][0], 'similarity': float(data[1])}
+                SimilarPhrase(phrase=data[0][0], similarity=float(data[1]))
            )
+    return similar
+
+
+def build_entity(ent: spacy, use_sense2vec: bool):
    return {
        'text': ent.text,
        'label': ent.label_,
@@ -65,22 +92,42 @@ def build_entity(ent, use_sense2vec):
        'start': ent.start,
        'end': ent.end,
        'text_with_ws': ent.text_with_ws,
-        'sense2vec': similar,
+        'sense2vec': compute_phrases(ent) if use_sense2vec else [],
    }


+class PhraseInSentence(pydantic.BaseModel):
+    """A <phrase> in a <sentence>."""
+
+    sentence: str
+    phrase: str
+
+    @pydantic.root_validator
+    def check_passwords_match(cls, values):
+        if values.get('phrase') not in values.get('sentence'):
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail='phrase must be in sentence'
+            )
+        return values
+
+
+@app.post('/sense2vec')
+async def sense2vec(request: PhraseInSentence):
+    enforce_components(['ner', 'parser', 'sense2vec'], 'sense2vec')
+    doc = nlp(request.sentence, disable=['tagger'])
+    for ent in list(doc.sents)[0].ents:
+        if ent.text == request.phrase:
+            return {'sense2vec': compute_phrases(ent)}
+
+
 class TextModel(pydantic.BaseModel):
    text: str


 @app.post('/pos')
 async def tag_parts_of_speech(request: TextModel):
-    if (not nlp.has_pipe('ner') or not nlp.has_pipe('parser')
-            or not nlp.has_pipe('tagger')):
-        raise fastapi.HTTPException(
-            status_code=400,
-            detail=pipeline_error.format('part-of-speech tagging')
-        )
+    enforce_components(['ner', 'parser', 'tagger'], 'part-of-speech tagging')
    data = []
    doc = nlp(request.text, disable=['sense2vec'])
    for token in [build_token(token) for token in doc]:
@@ -146,11 +193,7 @@ async def tokenize(request: TextModel):

 @app.post('/sentencizer')
 async def sentencize(request: TextModel):
-    if not nlp.has_pipe('parser'):
-        raise fastapi.HTTPException(
-            status_code=400,
-            detail=pipeline_error.format('sentence segmentation')
-        )
+    enforce_components(['parser'], 'sentence segmentation')
    doc = nlp(request.text, disable=['tagger', 'ner', 'sense2vec'])
    return {'sentences': [sent.text for sent in doc.sents]}


--- a/src/outputs/compute_phrases.json
+++ b/src/outputs/compute_phrases.json
+[
+  {
+    "phrase": "Mark Zuckerberg",
+    "similarity": 0.850600004196167
+  },
+  {
+    "phrase": "Warren Buffet",
+    "similarity": 0.8501999974250793
+  },
+  {
+    "phrase": "Warren Buffett",
+    "similarity": 0.8375999927520752
+  },
+  {
+    "phrase": "bill gates",
+    "similarity": 0.8215000033378601
+  },
+  {
+    "phrase": "Steve Jobs",
+    "similarity": 0.8180999755859375
+  },
+  {
+    "phrase": "Zuckerberg",
+    "similarity": 0.8163999915122986
+  },
+  {
+    "phrase": "Elon Musk",
+    "similarity": 0.8140000104904175
+  },
+  {
+    "phrase": "Bill gates",
+    "similarity": 0.8119999766349792
+  },
+  {
+    "phrase": "billionaire",
+    "similarity": 0.8116999864578247
+  },
+  {
+    "phrase": "Elon Musk",
+    "similarity": 0.8011999726295471
+  }
+]
\ No newline at end of file
--- a/src/outputs/sense2vec.json
+++ b/src/outputs/sense2vec.json
+{
+  "sense2vec": [
+    {
+      "phrase": "Mark Zuckerberg",
+      "similarity": 0.850600004196167
+    },
+    {
+      "phrase": "Warren Buffet",
+      "similarity": 0.8501999974250793
+    },
+    {
+      "phrase": "Warren Buffett",
+      "similarity": 0.8375999927520752
+    },
+    {
+      "phrase": "bill gates",
+      "similarity": 0.8215000033378601
+    },
+    {
+      "phrase": "Steve Jobs",
+      "similarity": 0.8180999755859375
+    },
+    {
+      "phrase": "Zuckerberg",
+      "similarity": 0.8163999915122986
+    },
+    {
+      "phrase": "Elon Musk",
+      "similarity": 0.8140000104904175
+    },
+    {
+      "phrase": "Bill gates",
+      "similarity": 0.8119999766349792
+    },
+    {
+      "phrase": "billionaire",
+      "similarity": 0.8116999864578247
+    },
+    {
+      "phrase": "Elon Musk",
+      "similarity": 0.8011999726295471
+    }
+  ]
+}
\ No newline at end of file
--- a/src/test_main.py
+++ b/src/test_main.py
 import json

+import fastapi
 import main
+import pytest
 import starlette.testclient

 client = starlette.testclient.TestClient(main.app)
@@ -36,6 +38,26 @@ def test_ner_sense2vec_fail():
    fail('/ner', ner_sense2vec_body, 'sense2vec')


+def test_sense2vec_success():
+    body = {
+        'sentence': 'Bill Gates founded Microsoft in April 4, 1975.',
+        'phrase': 'Bill Gates'
+    }
+    response = client.post('/sense2vec', json=body)
+    assert response.status_code == 200
+    with open('src/outputs/sense2vec.json') as f:
+        assert response.json() == json.load(f)
+
+
+def test_sense2vec_fail():
+    response = client.post(
+        '/sense2vec',
+        json={'sentence': 'My name is John Doe.', 'phrase': 'Johnny Doe'}
+    )
+    assert response.status_code == 400
+    assert response.json()['detail'] == 'phrase must be in sentence'
+
+
 pos_body = {'text': 'Apple is looking at buying U.K. startup for $1 billion'}


@@ -84,3 +106,23 @@ def fail(endpoint, body, pipe):
        response = client.post(endpoint, json=body)
        assert response.status_code == 400
        assert 'detail' in response.json()
+
+
+def test_enforce_components():
+    with pytest.raises(fastapi.HTTPException):
+        component = 'nonexistent_component'
+        main.enforce_components([component], component)
+
+
+def test_compute_phrases():
+    sentence = 'Bill Gates founded Microsoft in April 4, 1975.'
+    doc = main.nlp(sentence, disable=['tagger'])
+    for ent in list(doc.sents)[0].ents:
+        if ent.text == 'Bill Gates':
+            with open('src/outputs/compute_phrases.json') as f:
+                assert main.compute_phrases(ent) == json.load(f)
+
+
+def test_phrase_in_sentence():
+    with pytest.raises(fastapi.HTTPException):
+        main.PhraseInSentence(sentence='My name is John.', phrase='Johnny')