"""Look up Chinese administrative-division codes by (partial) region name."""

import os
from typing import List, Optional, Dict, Tuple

import pandas as pd


class AreaCodeTool:
    """Resolve region names to administrative-division codes and back.

    The data comes from a CSV file with at least a ``name`` column (full
    region name, e.g. province + city + district concatenated) and a ``code``
    column. Three lookup tables are derived from it:

    * ``full_name_map``: full name -> code
    * ``second_map``: text after the first '省' -> code
    * ``third_map``: district-level text (after '市' / '自治州') -> code
    """

    def __init__(self, csv_path: Optional[str] = None):
        """Load the CSV table and build the name lookup maps.

        Args:
            csv_path: Path of the CSV file. When ``None``, ``area_code.csv``
                located next to this source file is used.
        """
        if csv_path is None:
            # Default to the data file shipped in this module's directory.
            current_dir = os.path.dirname(os.path.abspath(__file__))
            csv_path = os.path.join(current_dir, "area_code.csv")

        # Read codes as text so they are never parsed as numbers.
        self.df = pd.read_csv(csv_path, dtype={'code': str})
        # Make sure every entry of the code column is a str.
        self.df['code'] = self.df['code'].astype(str)

        self._build_name_maps()

    def _build_name_maps(self):
        """Build the name -> code lookup tables from ``self.df``.

        Names are stripped of surrounding whitespace before being used as
        keys, and rows whose name is not a string (e.g. an empty CSV cell read
        as NaN) are skipped. When several rows produce the same key, the last
        row wins.
        """
        self.full_name_map = {}
        # Second-level map: the part after '省' (city, or city + district).
        self.second_map = {}
        # Third-level map: the district/county part.
        self.third_map = {}

        for raw_name, code in zip(self.df['name'], self.df['code']):
            if not isinstance(raw_name, str):
                # Blank name cell; calling .strip() on it would raise.
                continue
            name = raw_name.strip()
            self.full_name_map[name] = code

            has_province = '省' in name
            parts = name.split('省' if has_province else '市')
            # Text following the first separator; '' when nothing follows.
            tail = parts[1] if len(parts) > 1 else ''

            if has_province:
                if tail:
                    self.second_map[tail] = code
                # Counties can hang directly off a city or an autonomous
                # prefecture, so both separators are tried.
                for separator in ('市', '自治州'):
                    if separator not in name:
                        continue
                    city_parts = tail.split(separator)
                    if len(city_parts) > 1 and city_parts[1]:
                        self.third_map[city_parts[1]] = code
            elif tail:
                # No province in the name (e.g. municipalities): whatever
                # follows the first '市' is treated as the district.
                self.third_map[tail] = code

    def find_code(self, area_name: str) -> List[Tuple[str, str]]:
        """Find the code for a region name.

        Lookup order: exact full name, second-level name (only for queries
        ending in '市'), third-level name, then a fuzzy match where the query
        must be a subsequence of a full name. The first hit is returned.

        Args:
            area_name: Full or partial region name. Surrounding whitespace is
                ignored.

        Returns:
            A list holding one ``(query, code)`` tuple for the first match,
            where ``query`` is the stripped ``area_name``; an empty list when
            nothing matches or the query is empty / not a string.
        """
        if not isinstance(area_name, str):
            return []
        query = area_name.strip()
        if not query:
            # An empty query is a subsequence of every name and would
            # otherwise "match" the first row of the table.
            return []

        # Exact match on the full name.
        if query in self.full_name_map:
            return [(query, self.full_name_map[query])]

        # Second-level (city) match.
        if query.endswith('市') and query in self.second_map:
            return [(query, self.second_map[query])]

        # Third-level (district / county) match.
        if query in self.third_map:
            return [(query, self.third_map[query])]

        # Fuzzy match: first full name (in table order) containing the query
        # as a subsequence.
        for name, code in self.full_name_map.items():
            if self.is_subsequence(name, query):
                return [(query, code)]

        return []

    def get_full_name(self, code: str) -> Optional[str]:
        """Return the full region name for a code.

        Args:
            code: Region code, compared as a string against the ``code``
                column.

        Returns:
            The ``name`` of the first row with that code, or ``None`` when no
            row matches.
        """
        matches = self.df[self.df['code'] == code]
        if matches.empty:
            return None
        return matches.iloc[0]['name']

    def is_subsequence(self, source: str, target: str) -> bool:
        """Tell whether ``target`` is a subsequence of ``source``.

        Characters of ``target`` must appear in ``source`` in the same order,
        not necessarily adjacent. An empty ``target`` is a subsequence of any
        ``source``.
        """
        # `ch in remaining` consumes the iterator up to and including the
        # first match, so each later character is searched only after it.
        remaining = iter(source)
        return all(ch in remaining for ch in target)


# Usage example
def example_usage():
    """Run a few sample lookups against the default CSV and print them."""
    tool = AreaCodeTool()

    # Exercise the different kinds of query.
    test_cases = [
        "贵州省",
        "贵阳市",
        "云岩区",
        "贵州省贵阳市",
        "贵州省贵阳市南明区",
        "贵阳",  # fuzzy query
        "贵州贵阳",  # fuzzy query
        "贵州贵阳南明",  # fuzzy query
        "贵州贵阳市南明",  # fuzzy query
    ]

    for query in test_cases:
        print(f"\n查询: {query}")
        results = tool.find_code(query)
        for name, code in results:
            print(f"匹配结果: {name} -> {code}")

    # Reverse lookup from a code.
    code = "340802"  # Yingjiang District, Anqing City, Anhui Province
    full_name = tool.get_full_name(code)
    if full_name:
        print(f"\n代码反查: {code} -> {full_name}")


if __name__ == "__main__":
    example_usage()