georesolver
class PlaceResolver:
    """
    A unified resolver that queries multiple geolocation services in order
    and returns the first match with valid coordinates.

    Args:
        services (Optional[List[BaseQuery]]): List of geolocation service instances to use.
            When omitted, empty or not a list, GeoNames, WHG, TGN and Wikidata are used, in that order.
        places_map_json (Union[str, None]): Path to a custom places mapping JSON file.
        lang (Optional[str]): Language code for place type filtering. Defaults to "en".
        threshold (float): Fuzzy matching threshold (0-100) for place name similarity.
        flexible_threshold (bool): If True, use a lower threshold for shorter place names.
        flexible_threshold_value (float): The threshold value to use when flexible_threshold is True.
            If no value is provided, it defaults to 70.
        verbose (bool): If True, enable verbose logging.

    Raises:
        ValueError: If threshold is outside 0-100, or if flexible_threshold is enabled
            and flexible_threshold_value is outside 0-100.
    """

    def __init__(self,
                 services: Optional[List[BaseQuery]] = None,
                 places_map_json: Union[str, None] = None,
                 lang: Optional[str] = None,
                 threshold: float = 90,
                 flexible_threshold: bool = False,
                 flexible_threshold_value: float = 70,
                 verbose: bool = False):

        self.logger = setup_logger(self.__class__.__name__, verbose)

        # Anything that is not a non-empty list falls back to the default service chain.
        if not isinstance(services, list) or not services:
            services = [
                GeoNamesQuery(),
                WHGQuery(),
                TGNQuery(),
                WikidataQuery()
            ]

        self.services = services
        self.places_map = self._load_places_map(places_map_json)
        self.lang = lang if lang else "en"

        if not (0 <= threshold <= 100):
            raise ValueError("threshold must be between 0 and 100")

        self.threshold = threshold

        self.flexible_threshold = flexible_threshold
        if self.flexible_threshold:
            if not (0 <= flexible_threshold_value <= 100):
                raise ValueError("flexible_threshold_value must be between 0 and 100")

        self.flexible_threshold_value = flexible_threshold_value

        # Every service gets a logger configured with the resolver's verbosity.
        for service in self.services:
            service.logger = setup_logger(service.__class__.__name__, verbose)
            self.logger.debug(f"Updated logger for {service.__class__.__name__} with verbose={verbose}")

    def _load_places_map(self, custom_path=None):
        """
        Load the place-type mapping from `custom_path`, or from the packaged
        `data/mappings/places_map.json` when no path is given.

        Best effort: any failure is logged and an empty mapping is returned.
        """
        try:
            if custom_path:
                with open(custom_path, "r", encoding="utf-8") as f:
                    return json.load(f)
            else:
                resource_path = files("georesolver").joinpath("data/mappings/places_map.json")
                with resource_path.open("r", encoding="utf-8") as f:
                    return json.load(f)
        except Exception as e:
            self.logger.error(f"Error loading places map: {e}")
            return {}

    def resolve(self,
                place_name: str,
                country_code: Union[str, None] = None,
                place_type: Union[str, None] = None,
                use_default_filter: bool = False) -> Union[dict, None]:
        """
        Try resolving the place coordinates using multiple sources.

        Args:
            place_name (str): The place name to search
            country_code (str): ISO 3166-1 alpha-2 country code (optional)
            place_type (str): Place type (optional)
            use_default_filter (bool): If True, apply a default filter as fallback in case the place_type is not found.
                If no place_type is provided, no filtering will be applied.

        Returns:
            dict: The match returned by the first service that resolves the place,
            or None if the name is invalid or no service finds a match.
        """
        # Reject non-strings and names that are empty once surrounding whitespace is removed.
        if not isinstance(place_name, str) or not place_name.strip():
            self.logger.error("place_name must be a non-empty string")
            return None

        place_name = place_name.strip()

        if country_code is not None:
            # Only query pycountry with a real string; a failed or refused lookup
            # (LookupError) is treated as "unknown code".
            try:
                known_country = (
                    isinstance(country_code, str)
                    and pycountry.countries.get(alpha_2=country_code) is not None
                )
            except LookupError:
                known_country = False
            if not known_country:
                self.logger.warning(f"Invalid country code: {country_code}\nLook at the correct ISO 3166-1 alpha-2 country codes at https://www.iso.org/iso-3166-country-codes.html")
                country_code = None

        if self.flexible_threshold and len(place_name) < 5:
            self.logger.warning(
                f"Using flexible threshold for short place name: '{place_name}'"
            )
            threshold = self.flexible_threshold_value
        else:
            threshold = self.threshold

        mapper = None  # built lazily, once, and only when a place_type has to be translated

        for service in self.services:
            try:
                self.logger.info(f"Trying {service.__class__.__name__} for '{place_name}'")
                # e.g. "GeoNamesQuery" -> "geonames": key used to look up per-service place types
                service_key = service.__class__.__name__.lower().replace("query", "")

                resolved_type = None

                if place_type:
                    if mapper is None:
                        mapper = PlaceTypeMapper(self.places_map)
                    resolved_type = mapper.get_for_service(place_type, service_key)
                    if resolved_type is None and use_default_filter:
                        self.logger.warning(
                            f"Unrecognized place_type '{place_type}' for service '{service_key}', falling back to 'pueblo'."
                        )
                        resolved_type = mapper.get_for_service("pueblo", service_key)
                    elif resolved_type is None:
                        self.logger.debug(
                            f"Skipping place_type filter for service '{service_key}' (unrecognized type: '{place_type}')."
                        )

                results = service.places_by_name(place_name, country_code, resolved_type, lang=self.lang)
                result = service.get_best_match(results, place_name, fuzzy_threshold=threshold, lang=self.lang)
                if result:
                    self.logger.info(f"Resolved '{place_name}' via {service.__class__.__name__}: {result}")
                    return result
            except Exception as e:
                # A failing service must not stop the chain: log it and try the next one.
                traceback_str = traceback.format_exc()
                self.logger.warning(f"{service.__class__.__name__} failed for '{place_name}': {e}\n{traceback_str}")
        self.logger.warning(f"Could not resolve '{place_name}' via any service.")
        return None

    def resolve_batch(
        self,
        df: pd.DataFrame,
        place_column: str = "place_name",
        country_column: Union[str, None] = None,
        place_type_column: Union[str, None] = None,
        use_default_filter: bool = False,
        return_df: bool = True,
        show_progress: bool = True
    ) -> Union[pd.DataFrame, List[dict]]:
        """
        Resolve coordinates for a batch of places from a DataFrame.

        Rows whose place name is missing (NaN/None), not a string or blank are not
        sent to any service; they yield an empty result so the output stays aligned
        with the input rows.

        Args:
            df (pd.DataFrame): Input DataFrame with place names and optional country/type columns.
            place_column (str): Column name for place names.
            country_column (str): Column name for country codes (optional).
            place_type_column (str): Column name for place types (optional).
            use_default_filter (bool): Passed through to `resolve` for every row.
            return_df (bool): If True, return a DataFrame with separate columns for each attribute. Otherwise, return a list of dictionaries.
            show_progress (bool): If True, show a progress bar during processing.

        Raises:
            ValueError: If the input DataFrame is not valid or required columns are missing.

        Returns:
            pd.DataFrame: A DataFrame indexed like `df` with resolved coordinates and metadata;
                unresolved rows contain only missing values.
            List[dict]: One entry per input row (the `resolve` result, None when unresolved)
                if return_df is False.
        """
        # TODO:
        # - Process data in chunks of 100 rows
        # - Sort Series

        if not isinstance(df, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame")

        if place_column not in df.columns:
            raise ValueError(f"Column '{place_column}' not found in DataFrame")

        if country_column and country_column not in df.columns:
            raise ValueError(f"Column '{country_column}' not found in DataFrame")

        if place_type_column and place_type_column not in df.columns:
            raise ValueError(f"Column '{place_type_column}' not found in DataFrame")

        columns = ["place", "standardize_label", "language", "latitude", "longitude", "source", "place_id", "place_uri", "country_code", "part_of", "part_of_uri", "confidence", "threshold", "match_type"]

        def _none_if_missing(value):
            # pandas stores missing cells as NaN/NA/NaT, which are truthy; map them to None.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return None
            return value

        if show_progress:
            df_iter = tqdm(df.iterrows(), total=len(df))
        else:
            df_iter = df.iterrows()

        results = []
        for _, row in df_iter:
            place_name = _none_if_missing(row.get(place_column))
            if not isinstance(place_name, str) or not place_name.strip():
                results.append(None)
                continue

            country_code = _none_if_missing(row.get(country_column)) if country_column else None
            place_type = _none_if_missing(row.get(place_type_column)) if place_type_column else None

            results.append(
                self.resolve(
                    place_name=place_name,
                    country_code=country_code,
                    place_type=place_type,
                    use_default_filter=use_default_filter
                )
            )

        if not return_df:
            return results

        records = []
        for match in results:
            # Unresolved rows become empty records (all-missing rows in the frame).
            record = dict(match) if match else {}
            # Services may report the identifier as "id"/"uri"; expose it under the
            # "place_id"/"place_uri" output columns unless those keys already exist.
            if "place_id" not in record:
                record["place_id"] = record.get("id")
            if "place_uri" not in record:
                record["place_uri"] = record.get("uri")
            records.append(record)

        return pd.DataFrame(records, columns=columns, index=df.index)
A unified resolver that queries multiple geolocation services in order and returns the first match with valid coordinates.
Args: services (Optional[List[BaseQuery]]): List of geolocation service instances to use. places_map_json (Union[str, None]): Path to a custom places mapping JSON file. lang (Optional[str]): Language code for place type filtering. threshold (float): Fuzzy matching threshold for place name similarity. flexible_threshold (bool): If True, use a lower threshold for shorter place names. flexible_threshold_value (float): The threshold value to use when flexible_threshold is True. If no value is provided, it defaults to 70. verbose (bool): If True, enable verbose logging.
def __init__(self,
             services: Optional[List[BaseQuery]] = None,
             places_map_json: Union[str, None] = None,
             lang: Optional[str] = None,
             threshold: float = 90,
             flexible_threshold: bool = False,
             flexible_threshold_value: float = 70,
             verbose: bool = False):
    """
    Set up the resolver: logger, service chain, place-type mapping and thresholds.

    Raises:
        ValueError: If threshold is outside 0-100, or if flexible_threshold is
            enabled and flexible_threshold_value is outside 0-100.
    """
    self.logger = setup_logger(self.__class__.__name__, verbose)

    # Only a non-empty list is accepted; anything else selects the default chain.
    usable_services = isinstance(services, list) and len(services) > 0
    if not usable_services:
        services = [GeoNamesQuery(), WHGQuery(), TGNQuery(), WikidataQuery()]
    self.services = services

    self.places_map = self._load_places_map(places_map_json)
    self.lang = lang or "en"

    if not (0 <= threshold <= 100):
        raise ValueError("threshold must be between 0 and 100")
    self.threshold = threshold

    self.flexible_threshold = flexible_threshold
    # The alternative threshold is only validated when it can actually be used.
    if flexible_threshold and not (0 <= flexible_threshold_value <= 100):
        raise ValueError("flexible_threshold_value must be between 0 and 100")
    self.flexible_threshold_value = flexible_threshold_value

    # Give each service a logger that follows the resolver's verbosity.
    for svc in self.services:
        svc_name = svc.__class__.__name__
        svc.logger = setup_logger(svc_name, verbose)
        self.logger.debug(f"Updated logger for {svc_name} with verbose={verbose}")
def resolve(self,
            place_name: str,
            country_code: Union[str, None] = None,
            place_type: Union[str, None] = None,
            use_default_filter: bool = False) -> Union[dict, None]:
    """
    Try resolving the place coordinates using multiple sources.

    Args:
        place_name (str): The place name to search
        country_code (str): ISO 3166-1 alpha-2 country code (optional)
        place_type (str): Place type (optional)
        use_default_filter (bool): If True, apply a default filter as fallback in case the place_type is not found.
            If no place_type is provided, no filtering will be applied.

    Returns:
        dict: The match returned by the first service that resolves the place,
        or None if the name is invalid or no service finds a match.
    """
    # Reject non-strings and names that are empty once surrounding whitespace is removed.
    if not isinstance(place_name, str) or not place_name.strip():
        self.logger.error("place_name must be a non-empty string")
        return None

    place_name = place_name.strip()

    if country_code is not None:
        # Only query pycountry with a real string; a failed or refused lookup
        # (LookupError) is treated as "unknown code".
        try:
            known_country = (
                isinstance(country_code, str)
                and pycountry.countries.get(alpha_2=country_code) is not None
            )
        except LookupError:
            known_country = False
        if not known_country:
            self.logger.warning(f"Invalid country code: {country_code}\nLook at the correct ISO 3166-1 alpha-2 country codes at https://www.iso.org/iso-3166-country-codes.html")
            country_code = None

    if self.flexible_threshold and len(place_name) < 5:
        self.logger.warning(
            f"Using flexible threshold for short place name: '{place_name}'"
        )
        threshold = self.flexible_threshold_value
    else:
        threshold = self.threshold

    mapper = None  # built lazily, once, and only when a place_type has to be translated

    for service in self.services:
        try:
            self.logger.info(f"Trying {service.__class__.__name__} for '{place_name}'")
            # e.g. "GeoNamesQuery" -> "geonames": key used to look up per-service place types
            service_key = service.__class__.__name__.lower().replace("query", "")

            resolved_type = None

            if place_type:
                if mapper is None:
                    mapper = PlaceTypeMapper(self.places_map)
                resolved_type = mapper.get_for_service(place_type, service_key)
                if resolved_type is None and use_default_filter:
                    self.logger.warning(
                        f"Unrecognized place_type '{place_type}' for service '{service_key}', falling back to 'pueblo'."
                    )
                    resolved_type = mapper.get_for_service("pueblo", service_key)
                elif resolved_type is None:
                    self.logger.debug(
                        f"Skipping place_type filter for service '{service_key}' (unrecognized type: '{place_type}')."
                    )

            results = service.places_by_name(place_name, country_code, resolved_type, lang=self.lang)
            result = service.get_best_match(results, place_name, fuzzy_threshold=threshold, lang=self.lang)
            if result:
                self.logger.info(f"Resolved '{place_name}' via {service.__class__.__name__}: {result}")
                return result
        except Exception as e:
            # A failing service must not stop the chain: log it and try the next one.
            traceback_str = traceback.format_exc()
            self.logger.warning(f"{service.__class__.__name__} failed for '{place_name}': {e}\n{traceback_str}")
    self.logger.warning(f"Could not resolve '{place_name}' via any service.")
    return None
Try resolving the place coordinates using multiple sources.
Args: place_name (str): The place name to search country_code (str): ISO 3166-1 alpha-2 country code (optional) place_type (str): Place type (optional) use_default_filter (bool): If True, apply a default filter as fallback in case the place_type is not found. If no place_type is provided, no filtering will be applied.
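Returns: dict: the match returned by the first service that resolves the place (coordinates plus metadata such as source and confidence), or None if the place name is invalid or no service finds a match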
def resolve_batch(
    self,
    df: pd.DataFrame,
    place_column: str = "place_name",
    country_column: Union[str, None] = None,
    place_type_column: Union[str, None] = None,
    use_default_filter: bool = False,
    return_df: bool = True,
    show_progress: bool = True
) -> Union[pd.DataFrame, List[dict]]:
    """
    Resolve coordinates for a batch of places from a DataFrame.

    Rows whose place name is missing (NaN/None), not a string or blank are not
    sent to any service; they yield an empty result so the output stays aligned
    with the input rows.

    Args:
        df (pd.DataFrame): Input DataFrame with place names and optional country/type columns.
        place_column (str): Column name for place names.
        country_column (str): Column name for country codes (optional).
        place_type_column (str): Column name for place types (optional).
        use_default_filter (bool): Passed through to `resolve` for every row.
        return_df (bool): If True, return a DataFrame with separate columns for each attribute. Otherwise, return a list of dictionaries.
        show_progress (bool): If True, show a progress bar during processing.

    Raises:
        ValueError: If the input DataFrame is not valid or required columns are missing.

    Returns:
        pd.DataFrame: A DataFrame indexed like `df` with resolved coordinates and metadata;
            unresolved rows contain only missing values.
        List[dict]: One entry per input row (the `resolve` result, None when unresolved)
            if return_df is False.
    """
    # TODO:
    # - Process data in chunks of 100 rows
    # - Sort Series

    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame")

    if place_column not in df.columns:
        raise ValueError(f"Column '{place_column}' not found in DataFrame")

    if country_column and country_column not in df.columns:
        raise ValueError(f"Column '{country_column}' not found in DataFrame")

    if place_type_column and place_type_column not in df.columns:
        raise ValueError(f"Column '{place_type_column}' not found in DataFrame")

    columns = ["place", "standardize_label", "language", "latitude", "longitude", "source", "place_id", "place_uri", "country_code", "part_of", "part_of_uri", "confidence", "threshold", "match_type"]

    def _none_if_missing(value):
        # pandas stores missing cells as NaN/NA/NaT, which are truthy; map them to None.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    if show_progress:
        df_iter = tqdm(df.iterrows(), total=len(df))
    else:
        df_iter = df.iterrows()

    results = []
    for _, row in df_iter:
        place_name = _none_if_missing(row.get(place_column))
        if not isinstance(place_name, str) or not place_name.strip():
            results.append(None)
            continue

        country_code = _none_if_missing(row.get(country_column)) if country_column else None
        place_type = _none_if_missing(row.get(place_type_column)) if place_type_column else None

        results.append(
            self.resolve(
                place_name=place_name,
                country_code=country_code,
                place_type=place_type,
                use_default_filter=use_default_filter
            )
        )

    if not return_df:
        return results

    records = []
    for match in results:
        # Unresolved rows become empty records (all-missing rows in the frame).
        record = dict(match) if match else {}
        # Services may report the identifier as "id"/"uri"; expose it under the
        # "place_id"/"place_uri" output columns unless those keys already exist.
        if "place_id" not in record:
            record["place_id"] = record.get("id")
        if "place_uri" not in record:
            record["place_uri"] = record.get("uri")
        records.append(record)

    return pd.DataFrame(records, columns=columns, index=df.index)
Resolve coordinates for a batch of places from a DataFrame.
Args: df (pd.DataFrame): Input DataFrame with place names and optional country/type columns. place_column (str): Column name for place names. country_column (str): Column name for country codes (optional). place_type_column (str): Column name for place types (optional). use_default_filter (bool): Passed through to resolve() for every row; if True, apply a default filter as fallback in case the place_type is not found. return_df (bool): If True, return a DataFrame with separate columns for each attribute. Otherwise, return a list of dictionaries. show_progress (bool): If True, show a progress bar during processing.
Raises: ValueError: If the input DataFrame is not valid or required columns are missing.
Returns: pd.DataFrame: A DataFrame with resolved coordinates and metadata. List[dict]: A list of dictionaries with resolved coordinates and metadata if return_df is False.
class GeoNamesQuery(BaseQuery):
    """
    A class to interact with the GeoNames API.

    This class provides methods to search and retrieve geographic coordinates for places
    using the GeoNames API. It supports filtering by country and feature class.

    Attributes:
        username (str): GeoNames API username for authentication

    The API base URL is passed to the base class as `base_url=GEONAMES_ENDPOINT`.

    Example:
        >>> geonames = GeoNamesQuery(geonames_username="your_username")
        >>> results = geonames.places_by_name("Madrid", country_code="ES")
        >>> match = geonames.get_best_match(results, "Madrid", fuzzy_threshold=90)
    """

    def __init__(self, geonames_username: Union[str, None] = None):
        """
        Parameters:
            geonames_username (str): GeoNames account name. When omitted, the
                GEONAMES_USERNAME environment variable is used.

        Raises:
            ValueError: If no username is available from either source.
        """
        super().__init__(base_url=GEONAMES_ENDPOINT)
        if geonames_username:
            self.username = geonames_username
        else:
            self.username = os.getenv("GEONAMES_USERNAME")
            if not self.username:
                raise ValueError("GeoNames username must be provided either as an argument or via the GEONAMES_USERNAME environment variable.")

    def places_by_name(self, place_name: str, country_code: Optional[str], place_type: Optional[str] = None, lang: Optional[str] = None) -> dict:
        """
        Search for places using the GeoNames API.

        Parameters:
            place_name (str): Name of the place to search for
            country_code (str): Optional ISO 3166-1 alpha-2 country code
            place_type (str): Optional feature class (A: country, P: city/village, etc.).
                Additional types can be added in the data/mappings/geonames_place_map.json file.
            lang (str): Accepted for interface compatibility; not used by this query.

        Returns:
            dict: The decoded JSON response, or {"geonames": []} if the request fails.
        """

        params = {
            'q': place_name,
            'username': self.username,
            'maxRows': 10,
            'type': 'json',
            'style': 'FULL'
        }

        if country_code:
            params['country'] = country_code

        if place_type:
            params['featureClass'] = place_type.lower()

        try:
            response = self._limited_get(
                "/searchJSON",
                params=params
            )
            return response.json()
        except Exception as e:
            # Best effort: a failed query is reported as "no results".
            self.logger.error(f"Error querying GeoNames for '{place_name}': {str(e)}")
            return {"geonames": []}

    def _post_filtering(
        self,
        results: dict,
        place_name: str,
        fuzzy_threshold: float,
        confidence: float,
        lang: Optional[str] = "en") -> dict:
        """
        Returns the dictionary customized to the GeoNames API results.

        `results` is a single GeoNames record. The label is the alternate name in
        `lang` when one exists, otherwise the record's `toponymName`.

        Raises:
            KeyError: If the record lacks `toponymName` (when needed), `lat`, `lng` or `geonameId`.
        """

        standardize_label = ""

        if lang:
            self.logger.info(f"Post-filtering GeoNames results for '{place_name}' with language '{lang}'")

            # Alternate names do not always carry a "lang" key, so read it defensively.
            standardize_label = next(
                (
                    alt.get("name", "")
                    for alt in results.get("alternateNames") or []
                    if alt.get("lang") == lang
                ),
                "",
            )

        if not standardize_label:
            standardize_label = results["toponymName"]

        return {
            "place": place_name,
            "standardize_label": standardize_label,
            "language": lang,
            "latitude": float(results["lat"]),
            "longitude": float(results["lng"]),
            "source": "GeoNames",
            "id": results["geonameId"],
            "uri": f"http://sws.geonames.org/{results['geonameId']}/",
            "country_code": results.get("countryCode", ""),
            "part_of": "",
            "part_of_uri": "",
            "confidence": confidence,
            "threshold": fuzzy_threshold,
            "match_type": "exact" if confidence == 100 else "fuzzy"
        }

    def get_best_match(self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = None) -> Union[dict, None]:
        """
        Get the best matching place from the results based on name similarity.

        Parameters:
            results (Union[dict, list]): Results from places_by_name query
            place_name (str): Original place name to match against
            fuzzy_threshold (float): Minimum similarity score (0-100) for a match
            lang (str): Language used to pick the standardized label (optional)

        Returns:
            dictionary: A dictionary containing {
                "place": str, "standardize_label": str, "language": str, "latitude": float, "longitude": float,
                "source": "GeoNames", "id": str, "uri": str, "country_code": str, "part_of": str,
                "part_of_uri": str, "confidence": float, "threshold": fuzzy_threshold, "match_type": str
            }, or None when there are no results or no name reaches the threshold.
        """
        if not isinstance(results, dict) or not results.get("geonames"):
            return None

        geonames = results["geonames"]
        if len(geonames) == 1:
            # NOTE(review): a lone result is accepted with confidence 100 without
            # comparing names; kept as-is because callers may rely on it.
            return self._post_filtering(geonames[0], place_name, fuzzy_threshold, 100, lang)

        best_ratio = 0
        best_place = None
        needle = place_name.lower()

        for place in geonames:
            name = place.get("name") or ""
            candidates = [name] + [alt.get("name") or "" for alt in place.get("alternateNames") or []]

            for candidate in candidates:
                haystack = candidate.lower()
                ratio = max(fuzz.partial_ratio(needle, haystack), fuzz.ratio(needle, haystack))

                if ratio > best_ratio:
                    best_ratio = ratio
                    best_place = place
                    self.logger.info(f"Found match: '{name}' with similarity {ratio}%")

        # Build the result once, only for the winner and only if it clears the threshold.
        if best_place is not None and best_ratio >= fuzzy_threshold:
            return self._post_filtering(best_place, place_name, fuzzy_threshold, best_ratio, lang)

        return None
A class to interact with the GeoNames API.
This class provides methods to search and retrieve geographic coordinates for places using the GeoNames API. It supports filtering by country and feature class.
Attributes: base_url (str): The base URL for the GeoNames API username (str): GeoNames API username for authentication
Example:
geonames = GeoNamesQuery(geonames_username="your_username")
results = geonames.places_by_name("Madrid", country_code="ES")
match = geonames.get_best_match(results, "Madrid", fuzzy_threshold=90)
def __init__(self, geonames_username: Union[str, None] = None):
    """
    Initialise the query against GEONAMES_ENDPOINT.

    The username comes from `geonames_username` when given, otherwise from the
    GEONAMES_USERNAME environment variable.

    Raises:
        ValueError: If no username is available from either source.
    """
    super().__init__(base_url=GEONAMES_ENDPOINT)

    # An explicit argument wins; the environment is only the fallback.
    self.username = geonames_username or os.getenv("GEONAMES_USERNAME")
    if self.username:
        return
    raise ValueError("GeoNames username must be provided either as an argument or via the GEONAMES_USERNAME environment variable.")
def places_by_name(self, place_name: str, country_code: Optional[str], place_type: Optional[str] = None, lang: Optional[str] = None) -> dict:
    """
    Query the GeoNames `/searchJSON` endpoint for a place name.

    Parameters:
        place_name (str): Name of the place to search for
        country_code (str): Optional ISO 3166-1 alpha-2 country code
        place_type (str): Optional feature class (A: country, P: city/village, etc.);
            it is lower-cased before being sent.
        lang (str): Not used by this query.

    Returns:
        dict: The decoded JSON response, or {"geonames": []} if anything goes wrong.
    """
    query = {
        'q': place_name,
        'username': self.username,
        'maxRows': 10,
        'type': 'json',
        'style': 'FULL',
    }

    # Optional filters are only sent when a value was supplied.
    if country_code:
        query['country'] = country_code
    if place_type:
        query['featureClass'] = place_type.lower()

    try:
        return self._limited_get("/searchJSON", params=query).json()
    except Exception as e:
        self.logger.error(f"Error querying GeoNames for '{place_name}': {str(e)}")
        return {"geonames": []}
Search for places using the GeoNames API.
Parameters: place_name (str): Name of the place to search for country_code (str): Optional ISO 3166-1 alpha-2 country code place_type (str): Optional feature class (A: country, P: city/village, etc.). Additional types can be added in the data/mappings/geonames_place_map.json file.
def get_best_match(self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = None) -> Union[dict, None]:
    """
    Get the best matching place from the results based on name similarity.

    Parameters:
        results (Union[dict, list]): Results from places_by_name query
        place_name (str): Original place name to match against
        fuzzy_threshold (float): Minimum similarity score (0-100) for a match
        lang (str): Language used to pick the standardized label (optional)

    Returns:
        dictionary: The dictionary built by `_post_filtering` for the best
        candidate, or None when there are no results or no name reaches the
        threshold.
    """
    if not isinstance(results, dict) or not results.get("geonames"):
        return None

    geonames = results["geonames"]
    if len(geonames) == 1:
        # NOTE(review): a lone result is accepted with confidence 100 without
        # comparing names; kept as-is because callers may rely on it.
        return self._post_filtering(geonames[0], place_name, fuzzy_threshold, 100, lang)

    best_ratio = 0
    best_place = None
    needle = place_name.lower()

    for place in geonames:
        name = place.get("name") or ""
        candidates = [name] + [alt.get("name") or "" for alt in place.get("alternateNames") or []]

        for candidate in candidates:
            haystack = candidate.lower()
            ratio = max(fuzz.partial_ratio(needle, haystack), fuzz.ratio(needle, haystack))

            if ratio > best_ratio:
                best_ratio = ratio
                best_place = place
                self.logger.info(f"Found match: '{name}' with similarity {ratio}%")

    # Build the result once, only for the winner and only if it clears the threshold.
    if best_place is not None and best_ratio >= fuzzy_threshold:
        return self._post_filtering(best_place, place_name, fuzzy_threshold, best_ratio, lang)

    return None
Get the best matching place from the results based on name similarity.
Parameters: results (Union[dict, list]): Results from places_by_name query place_name (str): Original place name to match against fuzzy_threshold (float): Minimum similarity score (0-100) for a match
Returns: dictionary: A dictionary containing { "place": str, "standardize_label": str, "latitude": float, "longitude": float, "source": "GeoNames", "id": str, "uri": str, "country_code": str, "confidence": float, "threshold": fuzzy_threshold, "match_type": str }
class WHGQuery(BaseQuery):
    """
    A class to interact with the World Historical Gazetteer (WHG) API.

    This class provides methods to search and retrieve geographic coordinates for historical
    places using the WHG API. It supports filtering by country code and feature class,
    and includes functionality to find a matching place from multiple results.

    Attributes:
        search_domain (str): The API endpoint path for searches. Default is "index"
        dataset (str): The WHG dataset to search in (default: "")

    The API base URL is passed to the base class as `base_url=WHG_ENDPOINT`.

    Example:
        >>> whg = WHGQuery()
        >>> results = whg.places_by_name("Cuicatlán", country_code="MX", place_type="p")
        >>> match = whg.get_best_match(results, "Cuicatlán", fuzzy_threshold=90)
    """

    def __init__(self,
                 search_domain: str = "index",
                 dataset: str = ""):
        super().__init__(base_url=WHG_ENDPOINT)
        self.dataset = dataset
        self.search_domain = search_domain

    @sleep_and_retry
    @limits(calls=5, period=1)  # There's no official rate limit for WHG, but we set a conservative limit
    def places_by_name(self,
                       place_name: str,
                       country_code: Optional[str],
                       place_type: Optional[str] = "p",
                       lang: Optional[str] = None) -> Union[dict, list]:
        """
        Search for place using the World Historical Gazetteer API https://docs.whgazetteer.org/content/400-Technical.html#api

        Parameters:
            place_name (str): Any string with the name of the place. This keyword includes place names variants.
            country_code (str): ISO 3166-1 alpha-2 country code.
            place_type (str): Feature class according to Linked Places Format. Default is 'p' for place. Look at https://github.com/LinkedPasts/linked-places-format for more places classes.
            lang (str): Not used by this query.

        Returns:
            The decoded JSON response (filtered by country when `country_code` is given),
            or {"features": []} on a request or JSON error.
        """
        # Local import: the query string must be escaped so that names containing
        # spaces, '&', '#' or non-ASCII characters cannot corrupt the URL.
        from urllib.parse import quote, urlencode

        if not place_type:
            self.logger.debug("No place_type provided, defaulting to 'p' for place type.")
            place_type = "p"

        # Build URL with optional country code
        query = {"name": place_name, "fclass": place_type, "dataset": self.dataset}
        if country_code:
            query["ccodes"] = country_code
        url = f"{self.base_url}/{self.search_domain}/?{urlencode(query, quote_via=quote)}"

        try:
            response = self._limited_get(url)
            results = response.json()
            if country_code:
                return self._post_filtering_search(results, country_code=country_code)
            return results
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request error searching for '{place_name}': {str(e)}")
            return {"features": []}
        except ValueError as e:
            self.logger.error(f"Invalid JSON response for '{place_name}': {str(e)}")
            return {"features": []}

    def get_best_match(self,
                       results: Union[dict, list],
                       place_name: str,
                       fuzzy_threshold: float,
                       lang: Optional[str] = None) -> Union[dict, None]:
        """
        Return the first feature whose title is similar enough to `place_name`
        and that has usable coordinates.

        Features are examined in the order the API returned them. A feature that
        reaches `fuzzy_threshold` but has no usable coordinates is skipped so that
        a later candidate can still match.

        Returns:
            dict: The dictionary built by `_post_filtering`, or None when nothing
            qualifies or an error occurs while processing the results.
        """
        self.logger.info(f"Finding best match for '{place_name}' in WHG results")
        self.logger.debug(f"Results: {results}")

        try:
            features = results.get("features", []) if isinstance(results, dict) else []
            if not features:
                return None

            for r in features:
                name = r.get("properties", {}).get("title", "")
                if not name:
                    continue

                ratio = fuzz.ratio(name.lower(), place_name.lower())
                self.logger.info(f"Comparing '{name}' with '{place_name}': {ratio}% similarity")
                if ratio < fuzzy_threshold:
                    continue

                match = self._post_filtering(
                    results=r,
                    place_name=place_name,
                    fuzzy_threshold=fuzzy_threshold,
                    confidence=ratio,
                    lang=lang
                )
                if match is not None:
                    return match
                self.logger.debug(f"'{name}' matches '{place_name}' but has no usable coordinates; trying next result")

            return None

        except Exception as e:
            self.logger.error(f"Error processing results: {str(e)}")
            return None

    def _post_filtering_search(
        self,
        results: dict,
        country_code: Optional[str] = None
    ) -> dict:
        """
        Post-process the WHG API results to filter by country code. This extra step is necessary
        because the WHG API does a soft filtering by country code, but it does not guarantee that
        all results will match the provided country code.

        Returns:
            dict: {"features": [...]} containing only the features whose `ccodes`
            (under `properties`, else on the feature itself) include the code.
        """
        features = results.get("features") if isinstance(results, dict) else None
        if not features:
            return {"features": []}

        wanted = country_code.upper() if country_code else None

        filtered = []
        for feature in features:
            # Prefer properties.ccodes; fall back to the feature-level list when it
            # is missing, null or empty.
            ccodes = feature.get("properties", {}).get("ccodes") or feature.get("ccodes") or []

            # Check country code
            if wanted and wanted not in ccodes:
                continue

            filtered.append(feature)

        return {"features": filtered}

    def get_coordinates_lod_json(self, geometry: dict, place_name: str) -> Union[list, None]:
        """
        Extracts geographic coordinates from the WHG API response.

        For a GeometryCollection the coordinates of the first Point that has any
        are returned (None if there is none). For every other geometry the raw
        `coordinates` value is returned ([] when absent).
        """
        if geometry.get("type") != "GeometryCollection":
            return geometry.get("coordinates", [])

        self.logger.warning(f"Best match for '{place_name}' is a GeometryCollection. Taking the first valid point.")

        for geom in geometry.get("geometries", []):
            if geom.get("type") == "Point" and geom.get("coordinates"):
                return geom.get("coordinates")

        self.logger.warning(f"No valid Point found in GeometryCollection for '{place_name}'.")
        return None

    def _post_filtering(
        self,
        results: dict,
        place_name: str,
        fuzzy_threshold: float,
        confidence: float,
        lang: Optional[str] = "en") -> Union[dict, None]:
        """
        Returns the dictionary customized to the WHG API results.

        `results` is a single feature. Returns None when the feature has no
        position of the form [longitude, latitude] (an optional third element,
        e.g. elevation, is ignored).
        """
        self.logger.debug(f"Post-filtering WHG results for '{place_name}' with language '{lang}'\n{results}")

        geometry = results.get("geometry") or {}
        coordinates = self.get_coordinates_lod_json(geometry, place_name)
        if not coordinates or len(coordinates) < 2:
            return None

        try:
            # GeoJSON positions are ordered longitude, latitude.
            longitude = float(coordinates[0])
            latitude = float(coordinates[1])
        except (TypeError, ValueError):
            # Nested coordinates (lines, polygons, ...) are not a single position.
            return None

        props = results.get("properties") or {}
        name = props.get("title", "")
        index_id = props.get("index_id", "")
        ccodes = props.get("ccodes")
        self.logger.info(f"Best match for '{place_name}': {name} ({confidence}%)")
        return {
            "place": place_name,
            "standardize_label": name,
            "language": lang,
            "latitude": latitude,
            "longitude": longitude,
            "source": "WHG",
            "id": index_id,
            "uri": f"https://whgazetteer.org/places/{index_id}/portal/",
            "country_code": ccodes[0] if ccodes else "",
            "part_of": "",
            "part_of_uri": "",
            "confidence": confidence,
            "threshold": fuzzy_threshold,
            "match_type": "exact" if confidence == 100 else "fuzzy"
        }
A class to interact with the World Historical Gazetteer (WHG) API.
This class provides methods to search and retrieve geographic coordinates for historical places using the WHG API. It supports filtering by country code and feature class, and includes functionality to find the best matching place from multiple results.
Attributes: endpoint (str): The base URL for the WHG API search_domain (str): The API endpoint path for searches. Default is "/index" collection (str): The WHG collection to search in (default: "")
Example:
whg = WHGQuery("https://whgazetteer.org/api")
results = whg.places_by_name("Cuicatlán", country_code="MX", place_type="p")
best_match = whg.get_best_match(results, place_name="Cuicatlán", fuzzy_threshold=90)
@sleep_and_retry
@limits(calls=5, period=1)  # There's no official rate limit for WHG, but we set a conservative limit
def places_by_name(self,
                   place_name: str,
                   country_code: Optional[str],
                   place_type: Optional[str] = "p",
                   lang: Optional[str] = None) -> Union[dict, list]:
    """
    Search for a place using the World Historical Gazetteer API
    https://docs.whgazetteer.org/content/400-Technical.html#api

    Parameters:
        place_name (str): Any string with the name of the place. This keyword includes place names variants.
        country_code (Optional[str]): ISO 3166-1 alpha-2 country code. When given, it is sent as
            ``ccodes`` and the response is additionally passed through ``_post_filtering_search``.
        place_type (Optional[str]): Feature class according to Linked Places Format. Default is 'p' for place.
            Look at https://github.com/LinkedPasts/linked-places-format for more places classes.
        lang (Optional[str]): Accepted for signature parity with the other services; not used in the request.

    Returns:
        dict: The decoded JSON response (country-filtered when ``country_code`` is given), or
        ``{"features": []}`` when the request fails or the body is not valid JSON.
    """
    # Local import keeps this method self-contained with respect to the file's import block.
    from urllib.parse import quote, urlencode

    if not place_type:
        self.logger.debug("No place_type provided, defaulting to 'p' for place type.")
        place_type = "p"

    # Percent-encode every value: place names routinely contain spaces, accents,
    # and occasionally '&' or '#', which would otherwise split or truncate the query.
    query = {"name": place_name, "fclass": place_type, "dataset": self.dataset}
    if country_code:
        query["ccodes"] = country_code
    url = f"{self.base_url}/{self.search_domain}/?{urlencode(query, quote_via=quote)}"

    try:
        response = self._limited_get(url)
        results = response.json()
        if country_code:
            return self._post_filtering_search(results, country_code=country_code)
        return results
    except requests.exceptions.RequestException as e:
        self.logger.error(f"Request error searching for '{place_name}': {str(e)}")
        return {"features": []}
    except ValueError as e:
        self.logger.error(f"Invalid JSON response for '{place_name}': {str(e)}")
        return {"features": []}
Search for place using the World Historical Gazetteer API https://docs.whgazetteer.org/content/400-Technical.html#api
Parameters: place_name (str): Any string with the name of the place. This keyword includes place names variants. country_code (str): ISO 3166-1 alpha-2 country code. place_type (str): Feature class according to Linked Places Format. Default is 'p' for place. Look at https://github.com/LinkedPasts/linked-places-format for more places classes.
def get_best_match(self,
                   results: Union[dict, list],
                   place_name: str,
                   fuzzy_threshold: float,
                   lang: Optional[str] = None) -> Union[dict, None]:
    """
    Return the first WHG feature whose title is similar enough to ``place_name``
    and that yields usable coordinates.

    Parameters:
        results (Union[dict, list]): Results from places_by_name; anything that is not a
            dict with a non-empty "features" list produces ``None``.
        place_name (str): Original place name to match against.
        fuzzy_threshold (float): Minimum similarity score (0-100) for a match.
        lang (Optional[str]): Language code forwarded to ``_post_filtering``.

    Returns:
        Union[dict, None]: The dictionary built by ``_post_filtering`` for the first
        acceptable feature, or ``None`` when nothing qualifies or processing fails.
    """
    self.logger.info(f"Finding best match for '{place_name}' in WHG results")
    self.logger.debug(f"Results: {results}")

    try:
        features = results.get("features", []) if isinstance(results, dict) else []
        if not features:
            return None

        for feature in features:
            name = feature.get("properties", {}).get("title", "")
            if not name:
                continue

            ratio = fuzz.ratio(name.lower(), place_name.lower())
            self.logger.info(f"Comparing '{name}' with '{place_name}': {ratio}% similarity")
            if ratio < fuzzy_threshold:
                continue

            match = self._post_filtering(
                results=feature,
                place_name=place_name,
                fuzzy_threshold=fuzzy_threshold,
                confidence=ratio,
                lang=lang
            )
            if match:
                return match
            # A name match without usable coordinates must not end the search:
            # a later feature may carry both the name and a valid point.
            self.logger.debug(f"'{name}' matched '{place_name}' but has no usable coordinates; trying next result")

        return None

    except Exception as e:
        self.logger.error(f"Error processing results: {str(e)}")
        return None
Get the best matching place from the results based on name similarity.
Parameters: results (Union[dict, list]): Results from places_by_name query place_name (str): Original place name to match against fuzzy_threshold (float): Minimum similarity score (0-100) for a match lang (Optional[str]): Language code for place type
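Returns: Optional[dict]: None when no result reaches the threshold or the matched result has no usable coordinates; otherwise a dictionary containing { "place": place_name, "standardize_label": str, "language": str, "latitude": float, "longitude": float, "source": str, "id": str, "uri": str, "country_code": str, "part_of": str, "part_of_uri": str, "confidence": float, "threshold": fuzzy_threshold, "match_type": str }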
def get_coordinates_lod_json(self, geometry: dict, place_name: str) -> Union[list, None]:
    """
    Extract the coordinate list from the geometry of a WHG feature.

    Parameters:
        geometry (dict): The feature's "geometry" object.
        place_name (str): Place name being resolved; used only in log messages.

    Returns:
        Union[list, None]: For a GeometryCollection, the coordinates of the first
        Point member that has any, or ``None`` if there is none. For any other
        geometry, its "coordinates" value (``[]`` when the key is absent).
    """
    if geometry.get("type") != "GeometryCollection":
        return geometry.get("coordinates", [])

    self.logger.warning(f"Best match for '{place_name}' is a GeometryCollection. Taking the first valid point.")

    for geom in geometry.get("geometries", []):
        # Return the point as soon as it is found; previously the value was
        # located but never returned, so collections always resolved to None.
        if geom.get("type") == "Point" and geom.get("coordinates"):
            return geom["coordinates"]

    self.logger.warning(f"No valid Point found in GeometryCollection for '{place_name}'.")
    return None
Extracts geographic coordinates from the WHG API response.
class TGNQuery(BaseQuery):
    """
    A class to interact with the Getty Thesaurus of Geographic Names (TGN) SPARQL endpoint.

    This class provides methods to search and retrieve geographic coordinates for places
    using the Getty TGN linked open data service. It supports fuzzy matching of place names
    and filtering by country and place type.

    Attributes:
        sparql (SPARQLWrapper): SPARQL endpoint wrapper instance for TGN queries

    Example:
        >>> tgn = TGNQuery()
        >>> results = tgn.places_by_name("Madrid", "ES", "ciudad", lang="es")
        >>> match = tgn.get_best_match(results, "Madrid", fuzzy_threshold=90)
    """
    def __init__(self):
        super().__init__(base_url=TGN_ENDPOINT)
        self.sparql = SPARQLWrapper(self.base_url)
        self.sparql.setReturnFormat(JSON)

    @staticmethod
    def _sparql_escape(value) -> str:
        """Escape a value for use inside a double-quoted SPARQL string literal."""
        return (str(value)
                .replace("\\", "\\\\")
                .replace('"', '\\"')
                .replace("\n", "\\n")
                .replace("\r", "\\r"))

    @sleep_and_retry
    @limits(calls=10, period=1)
    def places_by_name(self, place_name: str, country_code: Optional[str], place_type: Optional[str] = None, lang: Optional[str] = "en") -> Union[dict, list]:
        """
        Search for places using the TGN SPARQL endpoint.

        Parameters:
            place_name (str): Name of the place to search for
            country_code (str): Country code or name. A value that pycountry resolves as an
                alpha-2 code is replaced by the country name; anything else is used verbatim.
            place_type (str): Optional type of place (e.g., 'ciudad', 'pueblo')
            lang (str): Language tag attached to the place type label. Falls back to "en".

        Returns:
            list: The SPARQL result bindings, or an empty list on error or unexpected format.
        """
        lang = lang or "en"
        country_name = ""

        if country_code:
            country = pycountry.countries.get(alpha_2=country_code)
            country_name = country.name if country else country_code

        # Everything interpolated into a quoted literal is escaped so that a name
        # containing '"' or '\' cannot break out of (or invalidate) the query.
        safe_name = self._sparql_escape(place_name)
        safe_country = self._sparql_escape(country_name)
        type_filter = (
            f'?p gvp:placeType [rdfs:label "{self._sparql_escape(place_type)}"@{lang}].'
            if place_type else ''
        )

        query = f"""
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX luc: <http://www.ontotext.com/owlim/lucene#>
        PREFIX gvp: <http://vocab.getty.edu/ontology#>
        PREFIX xl: <http://www.w3.org/2008/05/skos-xl#>
        PREFIX tgn: <http://vocab.getty.edu/tgn/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT DISTINCT ?p ?pLab ?context WHERE {{
            ?p skos:inScheme tgn:;
                luc:term "{safe_name}";
                gvp:prefLabelGVP [xl:literalForm ?pLab];
                gvp:parentString ?context.

            {type_filter}

            FILTER(CONTAINS(?context, "{safe_country}"))
        }}
        """

        self.logger.debug(f"Executing SPARQL {query} for TGN with place name '{place_name}' and country code '{country_code}'")

        try:
            self.sparql.setQuery(query)
            results = self.sparql.query().convert()
            self.logger.debug(f"SPARQL query results for '{place_name}': {results}")

            if isinstance(results, dict) and "results" in results and "bindings" in results["results"]:
                return results["results"]["bindings"]
            else:
                self.logger.error(f"Unexpected SPARQL result format for '{place_name}': {results}")
                return []
        except Exception as e:
            self.logger.error(f"Error querying TGN for '{place_name}': {str(e)}")
            return []

    def get_coordinates_lod_json(self, data: dict) -> dict:
        """
        Read the coordinate pair from a TGN record's "identified_by" entries.

        Returns:
            dict: ``{"latitude": ..., "longitude": ...}``; both values are ``None`` when
            no entry of type "crm:E47_Spatial_Coordinates" holds a parsable two-item value.
        """
        for item in data.get("identified_by", []):
            if item.get("type") != "crm:E47_Spatial_Coordinates":
                continue
            try:
                coords = ast.literal_eval(item.get("value"))
            except (ValueError, SyntaxError, TypeError):
                # Missing or malformed value: treat as "no coordinates here".
                continue
            if isinstance(coords, (list, tuple)) and len(coords) == 2:
                lon, lat = coords
                return {"latitude": lat, "longitude": lon}
        return {"latitude": None, "longitude": None}

    def _post_filtering(
            self,
            tgn_uri: str,
            place_name: str,
            fuzzy_threshold: float,
            confidence: float,
            lang: Optional[str] = "en") -> dict:
        """
        Fetch the JSON record behind ``tgn_uri`` and build the result dictionary.

        Returns:
            dict: The result dictionary, with ``None`` latitude/longitude when the record
            has no coordinates, or ``{}`` when the record cannot be fetched or decoded.
        """
        json_url = tgn_uri + ".json"
        try:
            response = self._limited_get(json_url)
            results = response.json()
        except Exception as e:
            self.logger.error(f"Error fetching TGN data for {place_name}: {e}")
            return {}

        coordinates = self.get_coordinates_lod_json(results)
        latitude = coordinates["latitude"]
        longitude = coordinates["longitude"]
        if latitude is None or longitude is None:
            self.logger.warning(f"No valid coordinates found for {place_name} in TGN results.")

        # "part_of" may be absent or an empty list; both fall back to an empty parent.
        parent = (results.get("part_of") or [{}])[0]

        return {
            "place": place_name,
            "standardize_label": results.get("_label", ""),
            "language": lang,
            # float(None) would raise; keep None so callers can see there are no coordinates.
            "latitude": float(latitude) if latitude is not None else None,
            "longitude": float(longitude) if longitude is not None else None,
            "source": "TGN",
            "id": results.get("id", ""),
            "uri": results.get("id", ""),
            "country_code": "",
            "part_of": parent.get("_label", ""),
            "part_of_uri": parent.get("id", ""),
            "confidence": confidence,
            "threshold": fuzzy_threshold,
            "match_type": "exact" if confidence == 100 else "fuzzy"
        }

    def get_best_match(self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = "en") -> Union[dict, None]:
        """
        Pick a TGN binding for ``place_name`` and resolve it to a result dictionary.

        A single binding is accepted as-is with confidence 100. With several bindings,
        the first whose preferred label reaches ``fuzzy_threshold`` is used.

        Returns:
            Union[dict, None]: The output of ``_post_filtering``, or ``None`` when there
            are no results or no label reaches the threshold.
        """
        if not results:
            self.logger.debug(f"No results found for '{place_name}' in TGN.")
            return None

        self.logger.debug(f"Finding best match for '{place_name}' in TGN {results}")

        if len(results) == 1:
            return self._post_filtering(results[0].get("p", {}).get("value", ""),
                                        place_name=place_name,
                                        fuzzy_threshold=fuzzy_threshold,
                                        confidence=100,
                                        lang=lang)

        for r in results:
            label = r.get("pLab", {}).get("value", "")
            uri = r.get("p", {}).get("value", "")
            ratio = fuzz.ratio(label.lower(), place_name.lower())
            self.logger.info(f"Comparing '{label}' with '{place_name} {uri}': {ratio}% similarity")
            if ratio >= fuzzy_threshold:
                self.logger.info(f"Best match for '{place_name}': {label} ({ratio}%)")
                return self._post_filtering(uri,
                                            place_name=place_name,
                                            fuzzy_threshold=fuzzy_threshold,
                                            confidence=ratio,
                                            lang=lang)

        self.logger.debug(f"No suitable match found for '{place_name}' in TGN.")
        return None
A class to interact with the Getty Thesaurus of Geographic Names (TGN) SPARQL endpoint.
This class provides methods to search and retrieve geographic coordinates for places using the Getty TGN linked open data service. It supports fuzzy matching of place names and filtering by country and place type.
Attributes: sparql (SPARQLWrapper): SPARQL endpoint wrapper instance for TGN queries lang (str): Language code for the place type (default: "en")
Example:
tgn = TGNQuery()
results = tgn.places_by_name("Madrid", "Spain", "ciudad", lang="es")
best_match = tgn.get_best_match(results, "Madrid", fuzzy_threshold=90)
@sleep_and_retry
@limits(calls=10, period=1)
def places_by_name(self, place_name: str, country_code: Optional[str], place_type: Optional[str] = None, lang: Optional[str] = "en") -> Union[dict, list]:
    """
    Search for places using the TGN SPARQL endpoint.

    Parameters:
        place_name (str): Name of the place to search for
        country_code (str): Country code or name. A value that pycountry resolves as an
            alpha-2 code is replaced by the country name; anything else is used verbatim.
        place_type (str): Optional type of place (e.g., 'ciudad', 'pueblo')
        lang (str): Language tag attached to the place type label. Falls back to "en".

    Returns:
        list: The SPARQL result bindings, or an empty list on error or unexpected format.
    """

    def _escape(value) -> str:
        # Escape the characters that would terminate or corrupt a
        # double-quoted SPARQL string literal.
        return (str(value)
                .replace("\\", "\\\\")
                .replace('"', '\\"')
                .replace("\n", "\\n")
                .replace("\r", "\\r"))

    lang = lang or "en"
    country_name = ""

    if country_code:
        country = pycountry.countries.get(alpha_2=country_code)
        country_name = country.name if country else country_code

    type_filter = f'?p gvp:placeType [rdfs:label "{_escape(place_type)}"@{lang}].' if place_type else ''

    query = f"""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX luc: <http://www.ontotext.com/owlim/lucene#>
    PREFIX gvp: <http://vocab.getty.edu/ontology#>
    PREFIX xl: <http://www.w3.org/2008/05/skos-xl#>
    PREFIX tgn: <http://vocab.getty.edu/tgn/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT DISTINCT ?p ?pLab ?context WHERE {{
        ?p skos:inScheme tgn:;
            luc:term "{_escape(place_name)}";
            gvp:prefLabelGVP [xl:literalForm ?pLab];
            gvp:parentString ?context.

        {type_filter}

        FILTER(CONTAINS(?context, "{_escape(country_name)}"))
    }}
    """

    self.logger.debug(f"Executing SPARQL {query} for TGN with place name '{place_name}' and country code '{country_code}'")

    try:
        self.sparql.setQuery(query)
        results = self.sparql.query().convert()
        self.logger.debug(f"SPARQL query results for '{place_name}': {results}")

        if isinstance(results, dict) and "results" in results and "bindings" in results["results"]:
            return results["results"]["bindings"]
        else:
            self.logger.error(f"Unexpected SPARQL result format for '{place_name}': {results}")
            return []
    except Exception as e:
        self.logger.error(f"Error querying TGN for '{place_name}': {str(e)}")
        return []
Search for places using the TGN SPARQL endpoint.
Parameters: place_name (str): Name of the place to search for country_code (str): Country code or name place_type (str): Optional type of place (e.g., 'ciudad', 'pueblo')
def get_coordinates_lod_json(self, data: dict) -> dict:
    """
    Read the coordinate pair from a TGN record's "identified_by" entries.

    Parameters:
        data (dict): Decoded TGN JSON record.

    Returns:
        dict: ``{"latitude": ..., "longitude": ...}`` taken from the first entry of type
        "crm:E47_Spatial_Coordinates" whose value parses to a two-item sequence
        (read as longitude, latitude). Both values are ``None`` when no entry qualifies.
    """
    for item in data.get("identified_by", []):
        if item.get("type") != "crm:E47_Spatial_Coordinates":
            continue
        try:
            coords = ast.literal_eval(item.get("value"))
        except (ValueError, SyntaxError, TypeError):
            # A missing or malformed value means "no coordinates in this entry",
            # not a failure of the whole lookup.
            continue
        if isinstance(coords, (list, tuple)) and len(coords) == 2:
            lon, lat = coords
            return {"latitude": lat, "longitude": lon}
    return {"latitude": None, "longitude": None}
def get_best_match(self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = "en") -> Union[dict, None]:
    """
    Pick a TGN binding for ``place_name`` and resolve it through ``_post_filtering``.

    A lone binding is accepted without comparison and reported with confidence 100.
    Otherwise bindings are scanned in order and the first whose preferred label
    scores at least ``fuzzy_threshold`` against ``place_name`` is used.

    Returns:
        Union[dict, None]: Whatever ``_post_filtering`` returns for the chosen binding,
        or ``None`` when ``results`` is empty or no label reaches the threshold.
    """
    if not results:
        self.logger.debug(f"No results found for '{place_name}' in TGN.")
        return None

    self.logger.debug(f"Finding best match for '{place_name}' in TGN {results}")

    if len(results) == 1:
        only_uri = results[0].get("p", {}).get("value", "")
        return self._post_filtering(
            only_uri,
            place_name=place_name,
            fuzzy_threshold=fuzzy_threshold,
            confidence=100,
            lang=lang,
        )

    for binding in results:
        label = binding.get("pLab", {}).get("value", "")
        uri = binding.get("p", {}).get("value", "")
        ratio = fuzz.ratio(label.lower(), place_name.lower())
        self.logger.info(f"Comparing '{label}' with '{place_name} {uri}': {ratio}% similarity")
        if ratio < fuzzy_threshold:
            continue

        self.logger.info(f"Best match for '{place_name}': {label} ({ratio}%)")
        return self._post_filtering(
            uri,
            place_name=place_name,
            fuzzy_threshold=fuzzy_threshold,
            confidence=ratio,
            lang=lang,
        )

    self.logger.debug(f"No suitable match found for '{place_name}' in TGN.")
    return None
Get the best matching place from the results based on name similarity.
Parameters: results (Union[dict, list]): Results from places_by_name query place_name (str): Original place name to match against fuzzy_threshold (float): Minimum similarity score (0-100) for a match lang (Optional[str]): Language code for place type
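Returns: Optional[dict]: None when there are no results or no label reaches the threshold; an empty dictionary when the matched TGN record cannot be fetched; otherwise a dictionary containing { "place": place_name, "standardize_label": str, "language": str, "latitude": float, "longitude": float, "source": str, "id": str, "uri": str, "country_code": str, "part_of": str, "part_of_uri": str, "confidence": float, "threshold": fuzzy_threshold, "match_type": str }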
class WikidataQuery(BaseQuery):
    """
    A class to interact with the Wikidata MediaWiki API for geographic coordinates lookup.

    Candidates come from ``wbsearchentities``; their claims, plus the country (P17) and
    administrative (P131) entities they point to, are fetched in batches with
    ``wbgetentities`` and used for filtering and for building the result dictionary.
    """

    def __init__(self,
                 search_endpoint=WIKIDATA_ENDPOINT,
                 entitydata_endpoint=ENTITYDATA_ENDPOINT):
        super().__init__(base_url=search_endpoint)
        self.search_endpoint = search_endpoint
        self.entitydata_endpoint = entitydata_endpoint

    @sleep_and_retry
    @limits(calls=30, period=1)
    def places_by_name(self,
                       place_name: str,
                       country_code: Optional[str],
                       place_type: Optional[str] = None,
                       lang: Optional[str] = "en") -> Union[dict, list]:
        """
        Search Wikidata for items named ``place_name`` that have coordinates.

        Parameters:
            place_name (str): Name of the place to search for
            country_code (Optional[str]): ISO 3166-1 alpha-2 code the item's country must match
            place_type (Optional[str]): QID that must appear among the item's P31 values
            lang (Optional[str]): Search and label language. Falls back to "en".

        Returns:
            list: One enriched dictionary per surviving candidate (label, qid, coordinates,
            entity data, claims, country and administrative info); empty on error or no hits.
        """
        # The search needs a language; an explicit None would otherwise be
        # forwarded as a missing parameter.
        lang = lang or "en"

        params = {
            "action": "wbsearchentities",
            "search": place_name,
            "language": lang,
            "format": "json",
            "type": "item",
            "limit": 10
        }

        try:
            response = self._limited_get(self.search_endpoint, params=params)
            search_results = response.json().get("search", [])
            self.logger.debug(f"Wikidata search results for '{place_name}': {search_results}")
        except Exception as e:
            self.logger.error(f"Error querying Wikidata for '{place_name}': {e}")
            return []

        if not search_results:
            return []

        qids = [result.get("id") for result in search_results if result.get("id")]
        self.logger.debug(f"Found {len(qids)} QIDs for '{place_name}': {qids}")
        if not qids:
            return []

        # Batch fetch entity data for all QIDs
        entities_data = self._batch_fetch_entities(qids)

        # Extract all country QIDs and administrative entity QIDs to batch fetch them too
        country_qids = set()
        admin_qids = set()
        for qid in entities_data:
            claims = entities_data[qid].get("claims", {})
            try:
                country_qid = claims.get("P17", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
                country_qids.add(country_qid)
            except (IndexError, KeyError):
                pass
            try:
                admin_qid = claims.get("P131", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
                admin_qids.add(admin_qid)
            except (IndexError, KeyError):
                pass

        # Batch fetch country and administrative entity data
        country_data = {}
        admin_data = {}
        if country_qids:
            country_data = self._batch_fetch_entities(list(country_qids))
        if admin_qids:
            admin_data = self._batch_fetch_entities(list(admin_qids))

        enriched_results = []
        for result in search_results:
            qid = result.get("id")
            label = result.get("label", "")

            if qid not in entities_data:
                continue

            entity_data = entities_data[qid]
            claims = entity_data.get("claims", {})

            coords = self._extract_coordinates(claims)
            if not coords or coords == (None, None):
                continue

            # Get country info for this place
            place_country_qid, place_country_iso = self._get_place_country_info(claims, country_data)

            # Apply the cheap filters before resolving the administrative label.
            if country_code and not self._match_country_optimized(place_country_iso, country_code):
                continue

            if place_type and not self._match_place_type(claims, place_type):
                continue

            # Get administrative entity info for this place
            admin_qid, admin_label = self._get_place_admin_info(claims, admin_data, lang)

            # Store all needed data for post-filtering
            enriched_results.append({
                "label": label,
                "qid": qid,
                "coordinates": coords,
                "entity_data": entity_data,
                "claims": claims,
                "country_qid": place_country_qid,
                "country_iso": place_country_iso,
                "admin_qid": admin_qid,
                "admin_label": admin_label
            })

        return enriched_results

    def _batch_fetch_entities(self, qids: List[str]) -> Dict[str, dict]:
        """
        Batch fetch entity data for multiple QIDs using wbgetentities API.
        This significantly reduces the number of HTTP requests compared to individual fetches.

        Returns:
            Dict[str, dict]: Entity data keyed by QID; QIDs that could not be fetched are absent.
        """
        entities_data = {}

        # Process QIDs in chunks of 50 (Wikidata API limit)
        chunk_size = 50
        for i in range(0, len(qids), chunk_size):
            chunk = qids[i:i + chunk_size]

            params = {
                "action": "wbgetentities",
                "ids": "|".join(chunk),
                "format": "json",
                "props": "labels|claims"  # Only fetch what we need
            }

            try:
                response = self._limited_get(self.search_endpoint, params=params)
                result = response.json()

                if "entities" in result:
                    entities_data.update(result["entities"])

            except Exception as e:
                self.logger.warning(f"Failed to batch fetch entities {chunk}: {e}")
                # Fallback to individual fetching for this chunk
                for qid in chunk:
                    entity_data = self._fetch_entity_data(qid)
                    if entity_data:
                        entities_data[qid] = entity_data

        return entities_data

    def _get_place_country_info(self, claims: dict, country_data: Dict[str, dict]) -> tuple:
        """
        Extract country QID and ISO code for a place using pre-fetched country data.
        Returns (country_qid, country_iso_code); both are "" when the place has no
        readable P17 claim, and the ISO code is "" when the country was not fetched.
        """
        try:
            country_qid = claims.get("P17", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
            if country_qid in country_data:
                country_claims = country_data[country_qid].get("claims", {})
                iso_code = country_claims.get("P297", [{}])[0].get("mainsnak", {}).get("datavalue", {}).get("value", "")
                return country_qid, iso_code.upper() if iso_code else ""
            return country_qid, ""
        except (IndexError, KeyError):
            return "", ""

    def _get_place_admin_info(self, claims: dict, admin_data: Dict[str, dict], lang: Optional[str]) -> tuple:
        """
        Extract administrative entity QID and label for a place using pre-fetched admin data.
        Returns (admin_qid, admin_label); the label is "" when ``lang`` is empty, the
        entity was not fetched, or it has no label in ``lang``.
        """
        try:
            admin_qid = claims.get("P131", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
            if admin_qid in admin_data and lang:
                admin_labels = admin_data[admin_qid].get("labels", {})
                admin_label = admin_labels.get(lang, {}).get("value", "")
                return admin_qid, admin_label
            return admin_qid, ""
        except (IndexError, KeyError):
            return "", ""

    def _match_country_optimized(self, place_country_iso: str, target_country_code: str) -> bool:
        """
        Optimized country matching using pre-extracted ISO codes.
        Case-insensitive; returns False when either code is empty.
        """
        if not place_country_iso or not target_country_code:
            return False
        return place_country_iso.upper() == target_country_code.upper()

    def _fetch_entity_data(self, qid: str) -> dict:
        """Fetch a single entity from the entity-data endpoint; returns {} on any failure."""
        try:
            url = f"{self.entitydata_endpoint}{qid}.json"
            response = self._limited_get(url)
            return response.json()["entities"][qid]
        except Exception as e:
            self.logger.warning(f"Failed to fetch entity data for {qid}: {e}")
            return {}

    def get_best_match(self,
                       results: Union[dict, list],
                       place_name: str,
                       fuzzy_threshold: float,
                       lang: Optional[str] = None) -> Union[dict, None]:
        """
        Return the highest-scoring candidate whose label reaches ``fuzzy_threshold``.

        The score is the larger of ``fuzz.ratio`` and ``fuzz.partial_ratio`` on the
        lower-cased label and place name; on ties the earliest candidate wins.

        Returns:
            Union[dict, None]: The dictionary built by ``_post_filtering``, or ``None``.
        """
        if not results:
            return None

        best_score = 0
        best_result = None

        for result in results:
            # Tolerate a candidate without a label instead of raising KeyError.
            label = result.get("label", "")
            score = max(fuzz.ratio(label.lower(), place_name.lower()),
                        fuzz.partial_ratio(label.lower(), place_name.lower()))

            if score > best_score and score >= fuzzy_threshold:
                best_score = score
                best_result = result
                self.logger.info(f"Wikidata match: '{label}' → {score}%")

        if best_result:
            return self._post_filtering(
                results=best_result,
                place_name=place_name,
                fuzzy_threshold=fuzzy_threshold,
                confidence=best_score,
                lang=lang
            )

        return None

    def _post_filtering(self,
                        results: dict,
                        place_name: str,
                        fuzzy_threshold: float,
                        confidence: float,
                        lang: Optional[str] = "en") -> dict:
        """
        Returns the dictionary customized to the Wikidata API results.

        ``results`` is one enriched candidate produced by ``places_by_name``.
        """
        qid = results.get("qid", "")
        label = results.get("label", "")
        coords = results.get("coordinates", (None, None))

        # Use pre-extracted country and administrative entity information
        country_code = results.get("country_iso", "")
        admin_qid = results.get("admin_qid", "")
        admin_label = results.get("admin_label", "")

        # Build part_of_uri if we have admin_qid
        part_of_uri = f"https://www.wikidata.org/entity/{admin_qid}" if admin_qid else ""

        return {
            "place": place_name,
            "standardize_label": label,
            "language": lang,
            "latitude": float(coords[0]) if coords[0] is not None else None,
            "longitude": float(coords[1]) if coords[1] is not None else None,
            "source": "Wikidata",
            "id": qid,
            "uri": f"https://www.wikidata.org/entity/{qid}",
            "country_code": country_code,
            "part_of": admin_label,
            "part_of_uri": part_of_uri,
            "confidence": confidence,
            "threshold": fuzzy_threshold,
            "match_type": "exact" if confidence == 100 else "fuzzy"
        }

    def _extract_coordinates(self, claims: dict) -> tuple:
        """Return (latitude, longitude) from the first P625 claim, or (None, None)."""
        try:
            coord_data = claims.get("P625", [])[0]["mainsnak"]["datavalue"]["value"]
            return coord_data["latitude"], coord_data["longitude"]
        except Exception:
            return (None, None)

    def _match_country(self, claims: dict, iso_code: str) -> bool:
        # DEPRECATED: Use _match_country_optimized instead
        # This method is kept for backward compatibility but should not be used
        # in the optimized workflow as it makes individual HTTP requests
        try:
            country_entity = claims.get("P17", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
            url = f"{self.entitydata_endpoint}{country_entity}.json"
            response = self._limited_get(url)
            country_data = response.json()
            wikidata_iso = country_data["entities"][country_entity]["claims"]["P297"][0]["mainsnak"]["datavalue"]["value"]
            return wikidata_iso.upper() == iso_code.upper()
        except Exception:
            return False

    def _match_place_type(self, claims: dict, expected_qid: str) -> bool:
        """Return True when ``expected_qid`` is among the item's P31 values."""
        try:
            types = [c["mainsnak"]["datavalue"]["value"]["id"] for c in claims.get("P31", [])]
            return expected_qid in types
        except Exception:
            return False
A class to interact with the Wikidata MediaWiki API for geographic coordinates lookup.
@sleep_and_retry
@limits(calls=30, period=1)
def places_by_name(self,
                   place_name: str,
                   country_code: Optional[str],
                   place_type: Optional[str] = None,
                   lang: Optional[str] = "en") -> Union[dict, list]:
    """
    Search Wikidata for items named ``place_name`` that have coordinates.

    Parameters:
        place_name (str): Name of the place to search for
        country_code (Optional[str]): ISO 3166-1 alpha-2 code the item's country must match
        place_type (Optional[str]): QID that must appear among the item's P31 values
        lang (Optional[str]): Search and label language. Falls back to "en".

    Returns:
        list: One enriched dictionary per surviving candidate (label, qid, coordinates,
        entity data, claims, country and administrative info); empty on error or no hits.
    """
    # The search needs a language; an explicit None would otherwise be
    # forwarded as a missing parameter.
    lang = lang or "en"

    params = {
        "action": "wbsearchentities",
        "search": place_name,
        "language": lang,
        "format": "json",
        "type": "item",
        "limit": 10
    }

    try:
        response = self._limited_get(self.search_endpoint, params=params)
        search_results = response.json().get("search", [])
        self.logger.debug(f"Wikidata search results for '{place_name}': {search_results}")
    except Exception as e:
        self.logger.error(f"Error querying Wikidata for '{place_name}': {e}")
        return []

    if not search_results:
        return []

    qids = [result.get("id") for result in search_results if result.get("id")]
    self.logger.debug(f"Found {len(qids)} QIDs for '{place_name}': {qids}")
    if not qids:
        return []

    # Batch fetch entity data for all QIDs
    entities_data = self._batch_fetch_entities(qids)

    # Extract all country QIDs and administrative entity QIDs to batch fetch them too
    country_qids = set()
    admin_qids = set()
    for qid in entities_data:
        claims = entities_data[qid].get("claims", {})
        try:
            country_qid = claims.get("P17", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
            country_qids.add(country_qid)
        except (IndexError, KeyError):
            pass
        try:
            admin_qid = claims.get("P131", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
            admin_qids.add(admin_qid)
        except (IndexError, KeyError):
            pass

    # Batch fetch country and administrative entity data
    country_data = {}
    admin_data = {}
    if country_qids:
        country_data = self._batch_fetch_entities(list(country_qids))
    if admin_qids:
        admin_data = self._batch_fetch_entities(list(admin_qids))

    enriched_results = []
    for result in search_results:
        qid = result.get("id")
        label = result.get("label", "")

        if qid not in entities_data:
            continue

        entity_data = entities_data[qid]
        claims = entity_data.get("claims", {})

        coords = self._extract_coordinates(claims)
        if not coords or coords == (None, None):
            continue

        # Get country info for this place
        place_country_qid, place_country_iso = self._get_place_country_info(claims, country_data)

        # Apply the cheap filters before resolving the administrative label.
        if country_code and not self._match_country_optimized(place_country_iso, country_code):
            continue

        if place_type and not self._match_place_type(claims, place_type):
            continue

        # Get administrative entity info for this place
        admin_qid, admin_label = self._get_place_admin_info(claims, admin_data, lang)

        # Store all needed data for post-filtering
        enriched_results.append({
            "label": label,
            "qid": qid,
            "coordinates": coords,
            "entity_data": entity_data,
            "claims": claims,
            "country_qid": place_country_qid,
            "country_iso": place_country_iso,
            "admin_qid": admin_qid,
            "admin_label": admin_label
        })

    return enriched_results
Search for places by name. Must be implemented by subclasses.
Parameters: place_name (str): Name of the place to search for country_code (Optional[str]): ISO 3166-1 alpha-2 country code place_type (Optional[str]): Optional place type filter lang (Optional[str]): Language code for place type
Returns: Union[dict, list]: Search results in service-specific format
def get_best_match(self,
                   results: Union[dict, list],
                   place_name: str,
                   fuzzy_threshold: float,
                   lang: Optional[str] = None) -> Union[dict, None]:
    """
    Return the highest-scoring Wikidata candidate that reaches ``fuzzy_threshold``.

    Each candidate's label is scored against ``place_name`` (both lower-cased) with the
    larger of ``fuzz.ratio`` and ``fuzz.partial_ratio``. Only a strictly higher score
    replaces the current leader, so the earliest candidate wins ties.

    Returns:
        Union[dict, None]: The dictionary built by ``_post_filtering`` for the winner,
        or ``None`` when ``results`` is empty or no candidate qualifies.
    """
    if not results:
        return None

    top_score = 0
    top_candidate = None

    for candidate in results:
        label = candidate["label"]
        lowered_label = label.lower()
        lowered_query = place_name.lower()
        score = max(fuzz.ratio(lowered_label, lowered_query),
                    fuzz.partial_ratio(lowered_label, lowered_query))

        # Skip anything below the threshold or not better than the current leader.
        if score < fuzzy_threshold or score <= top_score:
            continue

        top_score, top_candidate = score, candidate
        self.logger.info(f"Wikidata match: '{label}' → {score}%")

    if not top_candidate:
        return None

    return self._post_filtering(
        results=top_candidate,
        place_name=place_name,
        fuzzy_threshold=fuzzy_threshold,
        confidence=top_score,
        lang=lang
    )
Get the best matching place from the results based on name similarity.
Parameters: results (Union[dict, list]): Results from places_by_name query place_name (str): Original place name to match against fuzzy_threshold (float): Minimum similarity score (0-100) for a match lang (Optional[str]): Language code for place type
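Returns: Optional[dict]: None when there are no results or no label reaches the threshold; otherwise a dictionary containing { "place": place_name, "standardize_label": str, "language": str, "latitude": float, "longitude": float, "source": str, "id": str, "uri": str, "country_code": str, "part_of": str, "part_of_uri": str, "confidence": float, "threshold": fuzzy_threshold, "match_type": str }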