Commit d1bc8007 by 文靖昊

重写地区识别模糊匹配

parent 2a0c9663
...@@ -91,9 +91,14 @@ class AreaCodeTool: ...@@ -91,9 +91,14 @@ class AreaCodeTool:
# 模糊匹配 # 模糊匹配
if not results: if not results:
mask = self.df['name'].str.contains(area_name, na=False) for name in self.full_name_map.keys():
matches = self.df[mask] if self.is_subsequence(name,area_name):
results.extend([(row['name'], row['code']) for _, row in matches.iterrows()]) results.append((area_name, self.full_name_map[name]))
return results
# mask = self.df['name'].str.contains(area_name, na=False)
# matches = self.df[mask]
# results.extend([(row['name'], row['code']) for _, row in matches.iterrows()])
return results return results
...@@ -113,18 +118,37 @@ class AreaCodeTool: ...@@ -113,18 +118,37 @@ class AreaCodeTool:
return matches.iloc[0]['name'] return matches.iloc[0]['name']
return None return None
def is_subsequence(self, source: str, target: str) -> bool:
    """Return True if ``target`` is a subsequence of ``source``.

    Every character of ``target`` must occur in ``source`` in the same
    relative order, but the occurrences do not have to be adjacent.
    For example ``"贵州贵阳"`` is a subsequence of ``"贵州省贵阳市"``.

    Args:
        source: The string that is scanned (the longer, candidate name).
        target: The string whose characters must all be found, in order.

    Returns:
        True when all characters of ``target`` were found in order
        (an empty ``target`` therefore always matches), False otherwise.
    """
    # A single shared iterator over ``source``: each ``in`` test consumes
    # characters up to and including the first match, so the next target
    # character can only be matched further to the right. ``all`` stops
    # at the first target character that cannot be found.
    remaining = iter(source)
    return all(ch in remaining for ch in target)
# 使用示例 # 使用示例
def example_usage(): def example_usage():
tool = AreaCodeTool() tool = AreaCodeTool()
# 测试不同类型的查询 # 测试不同类型的查询
test_cases = [ test_cases = [
"安徽省", "贵州省",
"安庆市", "贵阳市",
"迎江区", "云岩区",
"安徽省安庆市", "贵州省贵阳市",
"安徽省安庆市迎江区", "贵州省贵阳市南明区",
"安庆" # 模糊查询 "贵阳", # 模糊查询
"贵州贵阳" , # 模糊查询
"贵州贵阳南明", # 模糊查询
"贵州贵阳市南明" , # 模糊查询
] ]
for query in test_cases: for query in test_cases:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment