beki committed on
Commit
a409919
1 Parent(s): 3aff0ba

Create spacy_recognizer.py

Browse files
Files changed (1) hide show
  1. spacy_recognizer.py +131 -0
spacy_recognizer.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, List, Tuple, Set
3
+
4
+ from presidio_analyzer import (
5
+ RecognizerResult,
6
+ LocalRecognizer,
7
+ AnalysisExplanation,
8
+ )
9
+ from presidio_analyzer.nlp_engine import NlpArtifacts
10
+ from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer
11
+
12
# Module-level logger, registered under the name "presidio-analyzer".
+ logger = logging.getLogger("presidio-analyzer")
13
+
14
+
15
class CustomSpacyRecognizer(LocalRecognizer):
    """Recognizer that converts spaCy NER spans into ``RecognizerResult`` objects.

    The recognizer does not run a model itself. ``analyze`` reads the entity
    spans found in the ``nlp_artifacts`` it is given and reports every span
    whose label belongs to the label group of a requested entity, always with
    the fixed score ``ner_strength``.
    """

    # Entity names reported when the caller does not pass ``supported_entities``.
    ENTITIES = [
        "LOCATION",
        "PERSON",
        "NRP",
        "ORGANIZATION",
        "DATE_TIME",
    ]

    # Template for the textual explanation; ``{}`` receives the span's label.
    DEFAULT_EXPLANATION = "Identified as {} by Spacy's Named Entity Recognition (Privy-trained)"

    # Pairs of (requested entity names, model labels accepted for them).
    CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"NRP"}, {"NORP", "NRP"}),
        ({"ORGANIZATION"}, {"ORG"}),
        ({"DATE_TIME"}, {"DATE_TIME"}),
    ]

    # Not read inside this class.
    # NOTE(review): presumably maps a language code to the model to load for
    # it when the NLP engine is configured -- confirm against the caller.
    MODEL_LANGUAGES = {
        "en": "beki/en_spacy_pii_distilbert",
    }

    # Model label -> entity name. Not read inside this class.
    # The key "NORP" was previously misspelled "NROP", which disagreed with
    # the "NORP" label listed in CHECK_LABEL_GROUPS above.
    PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        "NORP": "NRP",
        "DATE_TIME": "DATE_TIME",
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        check_label_groups: Optional[List[Tuple[Set, Set]]] = None,
        context: Optional[List[str]] = None,
        ner_strength: float = 0.85,
    ):
        """
        Configure the recognizer.

        :param supported_language: Language code this recognizer handles.
        :param supported_entities: Entity names to report; falls back to
            ``ENTITIES`` when empty or ``None``.
        :param check_label_groups: Pairs of (entity names, model labels);
            falls back to ``CHECK_LABEL_GROUPS`` when empty or ``None``.
        :param context: Context words, forwarded to the base recognizer.
        :param ner_strength: Score assigned to every result.
        """
        # Set instance state before delegating to the base initializer.
        self.ner_strength = ner_strength
        self.check_label_groups = check_label_groups or self.CHECK_LABEL_GROUPS
        super().__init__(
            supported_entities=supported_entities or self.ENTITIES,
            supported_language=supported_language,
            # Previously accepted but silently dropped.
            context=context,
        )

    def load(self) -> None:
        """Do nothing; this recognizer loads no model of its own."""

    def get_supported_entities(self) -> List[str]:
        """
        Return supported entities by this model.

        :return: List of the supported entities.
        """
        return self.supported_entities

    def build_spacy_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return: ``AnalysisExplanation`` naming this recognizer class
        """
        return AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )

    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Report the NER spans that match the requested entities.

        :param text: The analyzed text (not read here; offsets come from the
            spans in ``nlp_artifacts``).
        :param entities: Entity names requested by the caller; names outside
            ``self.supported_entities`` are skipped.
        :param nlp_artifacts: Object whose ``entities`` attribute holds spans
            exposing ``label_``, ``start_char`` and ``end_char``.
        :return: One result per (requested entity, matching span) pair, or an
            empty list when no artifacts are provided.
        """
        results: List[RecognizerResult] = []
        if not nlp_artifacts:
            logger.warning("Skipping SpaCy, nlp artifacts not provided...")
            return results

        ner_entities = nlp_artifacts.entities

        for entity in entities:
            if entity not in self.supported_entities:
                continue
            for ent in ner_entities:
                if not self.__check_label(entity, ent.label_, self.check_label_groups):
                    continue
                explanation = self.build_spacy_explanation(
                    self.ner_strength,
                    self.DEFAULT_EXPLANATION.format(ent.label_),
                )
                results.append(
                    RecognizerResult(
                        entity_type=entity,
                        start=ent.start_char,
                        end=ent.end_char,
                        score=self.ner_strength,
                        analysis_explanation=explanation,
                        recognition_metadata={
                            RecognizerResult.RECOGNIZER_NAME_KEY: self.name
                        },
                    )
                )

        return results

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
    ) -> bool:
        """Return True when some group contains both ``entity`` and ``label``."""
        # Generator (not a list) so ``any`` can stop at the first match.
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )