# pattern for: 1 American rd, Dearborn, MI 48126, United States
# pattern for: 1 American High way, South Dakota 48126, United States
# pattern for: 1 American rd, Dearborn, MI, United States
# pattern for: 717 N 2ND ST, MANKATO, MN 56001 US
# Token sequence for an ADDRESS entity: door number, street name(s), up to two
# optional runs of city/state/country tokens, optional ZIP code(s), and a
# required closing city/state/country run. Every part may be preceded by one
# optional token matched by the regex \W{1,2}.
# NOTE(review): spaCy's REGEX predicate appears to use search semantics, so the
# unanchored \W{1,2} would accept any token that contains a non-word
# character, not only pure punctuation - confirm this is intended.
ADDRESS_PATTERN_1 = [
    {
        "label": "ADDRESS",
        "pattern": [
            {"ENT_TYPE": "POTENTIAL_DOOR_NUM"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # One or more street-name tokens.
            {"ENT_TYPE": "POTENTIAL_STREET_NAME", "OP": "+"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # First optional city/state/country run.
            {"ENT_TYPE": "POTENTIAL_CITY_OR_STATE_OR_COUNTRY_NAME", "OP": "*"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # Second optional city/state/country run.
            {"ENT_TYPE": "POTENTIAL_CITY_OR_STATE_OR_COUNTRY_NAME", "OP": "*"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # ZIP code is optional in this rule.
            {"ENT_TYPE": "ZIP_CODE", "OP": "*"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # The rule must end on at least one city/state/country token.
            {"ENT_TYPE": "POTENTIAL_CITY_OR_STATE_OR_COUNTRY_NAME", "OP": "+"},
        ],
    },
]
# pattern for: one American Road Dearborn Michigan 48126 United States
# Token sequence for an ADDRESS entity with no street-name part: door number,
# a city/state/country run, optional ZIP code(s), and a closing
# city/state/country run. Parts may be separated by one optional token matched
# by the regex \W{1,2}.
ADDRESS_PATTERN_2 = [
    {
        "label": "ADDRESS",
        "pattern": [
            {"ENT_TYPE": "POTENTIAL_DOOR_NUM"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # One or more city/state/country tokens directly after the number.
            {"ENT_TYPE": "POTENTIAL_CITY_OR_STATE_OR_COUNTRY_NAME", "OP": "+"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # ZIP code is optional in this rule.
            {"ENT_TYPE": "ZIP_CODE", "OP": "*"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # The rule must end on at least one city/state/country token.
            {"ENT_TYPE": "POTENTIAL_CITY_OR_STATE_OR_COUNTRY_NAME", "OP": "+"},
        ],
    },
]
# 717 N 2ND ST, MANKATO, MN, 56001
# 717 N 2ND ST MANKATO MN 56001
# Token sequence for an ADDRESS entity that ends on a ZIP code: door number,
# street name(s), a city/state/country run, then at least one ZIP code token.
# Parts may be separated by one optional token matched by the regex \W{1,2}.
ADDRESS_PATTERN_3 = [
    {
        "label": "ADDRESS",
        "pattern": [
            {"ENT_TYPE": "POTENTIAL_DOOR_NUM"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # One or more street-name tokens.
            {"ENT_TYPE": "POTENTIAL_STREET_NAME", "OP": "+"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # One or more city/state/country tokens.
            {"ENT_TYPE": "POTENTIAL_CITY_OR_STATE_OR_COUNTRY_NAME", "OP": "+"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # ZIP code is mandatory here and closes the match.
            {"ENT_TYPE": "ZIP_CODE", "OP": "+"},
        ],
    },
]
# Chennai, TamilNadu
# Dearborn Michigan
# Minimal ADDRESS rule: one or more city/state/country tokens followed by one
# optional token matched by the regex \W{1,2}.
ADDRESS_PATTERN_4 = [
    {
        "label": "ADDRESS",
        "pattern": [
            {"ENT_TYPE": "POTENTIAL_CITY_OR_STATE_OR_COUNTRY_NAME", "OP": "+"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
        ],
    },
]
# PO Box 107050, Albany, NY 12201-7050
# PO Box 107050, Albany, NY 12201
# PO Box 107050, Albany, NY, US 12201
# Token sequence for a P.O. box ADDRESS entity: P_O_BOX token(s), two required
# city/state/country runs, an optional third city/state/country token, then at
# least one ZIP code token. Parts may be separated by one optional token
# matched by the regex \W{1,2}.
ADDRESS_PATTERN_5 = [
    {
        "label": "ADDRESS",
        "pattern": [
            {"ENT_TYPE": "P_O_BOX", "OP": "+"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # First required city/state/country run.
            {"ENT_TYPE": "POTENTIAL_CITY_OR_STATE_OR_COUNTRY_NAME", "OP": "+"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # Second required city/state/country run.
            {"ENT_TYPE": "POTENTIAL_CITY_OR_STATE_OR_COUNTRY_NAME", "OP": "+"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # Optional single extra city/state/country token before the ZIP.
            {"ENT_TYPE": "POTENTIAL_CITY_OR_STATE_OR_COUNTRY_NAME", "OP": "?"},
            {"TEXT": {"REGEX": "\\W{1,2}"}, "OP": "?"},
            # ZIP code is mandatory here and closes the match.
            {"ENT_TYPE": "ZIP_CODE", "OP": "+"},
        ],
    },
]
# Flat list of every ADDRESS rule, in definition order (pattern 1 through 5).
second_degree_address_patterns = [
    *ADDRESS_PATTERN_1,
    *ADDRESS_PATTERN_2,
    *ADDRESS_PATTERN_3,
    *ADDRESS_PATTERN_4,
    *ADDRESS_PATTERN_5,
]
def create_second_degree_address_nlp(first_degree_address_patterns,
                                     second_degree_address_patterns):
    """Build a spaCy pipeline with two chained entity rulers.

    Loads ``en_core_web_sm`` with its ``ner`` component disabled, then adds
    an entity ruler named ``first_degree_rules`` followed by one named
    ``second_degree_rules``. Both rulers are created with pattern validation
    and ``overwrite_ents`` switched on.

    Args:
        first_degree_address_patterns: Patterns given to the first ruler.
        second_degree_address_patterns: Patterns given to the second ruler,
            which is placed after the first one in the pipeline.

    Returns:
        The configured spaCy pipeline object.
    """
    # The same settings dict is handed to both rulers.
    ruler_settings = {"validate": True, "overwrite_ents": True}

    nlp = spacy.load('en_core_web_sm', disable=['ner'])

    nlp.add_pipe(
        "entity_ruler",
        name="first_degree_rules",
        config=ruler_settings,
    ).add_patterns(first_degree_address_patterns)

    # Explicitly ordered after the first ruler.
    nlp.add_pipe(
        "entity_ruler",
        name="second_degree_rules",
        config=ruler_settings,
        after='first_degree_rules',
    ).add_patterns(second_degree_address_patterns)

    return nlp