georesolver

1from .resolver import PlaceResolver, GeoNamesQuery, WHGQuery, TGNQuery, WikidataQuery
2
3__all__ = ["PlaceResolver", "GeoNamesQuery", "WHGQuery", "TGNQuery", "WikidataQuery"]
class PlaceResolver:
 825class PlaceResolver:
 826    """
 827    A unified resolver that queries multiple geolocation services in order
 828    and returns the first match with valid coordinates.
 829
 830    Args:
 831            services (Optional[List[BaseQuery]]): List of geolocation service instances to use.
 832            places_map_json (Union[str, None]): Path to a custom places mapping JSON file.
 833            lang (Optional[str]): Language code for place type filtering.
 834            threshold (float): Fuzzy matching threshold for place name similarity.
 835            flexible_threshold (bool): If True, use a lower threshold for shorter place names.
 836            flexible_threshold_value (float): The threshold value to use when flexible_threshold is True.
 837                                                If no value is provided, it defaults to 70.
 838            verbose (bool): If True, enable verbose logging.
 839
 840    """
 841    def __init__(self, 
 842                 services: Optional[List[BaseQuery]] = None, 
 843                 places_map_json: Union[str, None] = None, 
 844                 lang: Optional[str] = None, 
 845                 threshold: float = 90,
 846                 flexible_threshold: bool = False,
 847                 flexible_threshold_value: float = 70, 
 848                 verbose: bool = False):
 849
 850        self.logger = setup_logger(self.__class__.__name__, verbose)
 851        
 852        if services is None or not isinstance(services, list) or len(services) == 0:
 853            services = [
 854                GeoNamesQuery(),
 855                WHGQuery(),
 856                TGNQuery(),
 857                WikidataQuery()
 858            ]
 859
 860        self.services = services
 861        self.places_map = self._load_places_map(places_map_json)
 862        self.lang = lang if lang else "en"
 863
 864        if not (0 <= threshold <= 100):
 865            raise ValueError("threshold must be between 0 and 100")
 866        
 867        self.threshold = threshold
 868
 869        self.flexible_threshold = flexible_threshold
 870        if self.flexible_threshold:
 871            if not (0 <= flexible_threshold_value <= 100):
 872                raise ValueError("flexible_threshold_value must be between 0 and 100")
 873            
 874            self.flexible_threshold_value = flexible_threshold_value
 875        
 876        
 877
 878        for service in self.services:
 879            service.logger = setup_logger(service.__class__.__name__, verbose)
 880            self.logger.debug(f"Updated logger for {service.__class__.__name__} with verbose={verbose}")
 881
 882    def _load_places_map(self, custom_path=None):
 883        try:
 884            if custom_path:
 885                with open(custom_path, "r", encoding="utf-8") as f:
 886                    return json.load(f)
 887            else:
 888                resource_path = files("georesolver").joinpath("data/mappings/places_map.json")
 889                with resource_path.open("r", encoding="utf-8") as f:
 890                    return json.load(f)
 891        except Exception as e:
 892            self.logger.error(f"Error loading places map: {e}")
 893            return {}
 894
 895
 896    def resolve(self, 
 897                place_name: str, 
 898                country_code: Union[str, None] = None, 
 899                place_type: Union[str, None] = None,
 900               use_default_filter: bool = False) -> Union[dict, None]:
 901        """
 902        Try resolving the place coordinates using multiple sources.
 903
 904        Args:
 905            place_name (str): The place name to search
 906            country_code (str): ISO 3166-1 alpha-2 country code (optional)
 907            place_type (str): Place type (optional)
 908            use_default_filter (bool): If True, apply a default filter as fallback in case the place_type is not found.
 909                                        If no place_type is provided, no filtering will be applied.
 910
 911        Returns:
 912            tuple: (lat, lon) or (None, None) if not found
 913        """
 914
 915        if not place_name or not isinstance(place_name, str):
 916            self.logger.error("place_name must be a non-empty string")
 917            return None
 918
 919        place_name = place_name.strip()
 920
 921        if pycountry.countries.get(alpha_2=country_code) is None and country_code is not None:
 922            self.logger.warning(f"Invalid country code: {country_code}\nLook at the correct ISO 3166-1 alpha-2 country codes at https://www.iso.org/iso-3166-country-codes.html")
 923            country_code = None
 924
 925        if self.flexible_threshold and len(place_name) < 5:
 926            self.logger.warning(
 927                f"Using flexible threshold for short place name: '{place_name}'"
 928            )
 929            threshold = self.flexible_threshold_value
 930        else:
 931            threshold = self.threshold
 932
 933        for service in self.services:
 934            try:
 935                self.logger.info(f"Trying {service.__class__.__name__} for '{place_name}'")
 936                mapper = PlaceTypeMapper(self.places_map)
 937                service_key = service.__class__.__name__.lower().replace("query", "")
 938
 939                resolved_type = None
 940
 941                if place_type:
 942                    resolved_type = mapper.get_for_service(place_type, service_key)
 943                    if resolved_type is None and use_default_filter:
 944                        self.logger.warning(
 945                            f"Unrecognized place_type '{place_type}' for service '{service_key}', falling back to 'pueblo'."
 946                        )
 947                        resolved_type = mapper.get_for_service("pueblo", service_key)
 948                    elif resolved_type is None:
 949                        self.logger.debug(
 950                            f"Skipping place_type filter for service '{service_key}' (unrecognized type: '{place_type}')."
 951                        )
 952
 953                results = service.places_by_name(place_name, country_code, resolved_type, lang=self.lang)
 954                result = service.get_best_match(results, place_name, fuzzy_threshold=threshold, lang=self.lang)
 955                if result:
 956                    self.logger.info(f"Resolved '{place_name}' via {service.__class__.__name__}: {result}")
 957                    return result
 958            except Exception as e:
 959                traceback_str = traceback.format_exc()
 960                self.logger.warning(f"{service.__class__.__name__} failed for '{place_name}': {e}\n{traceback_str}")
 961        self.logger.warning(f"Could not resolve '{place_name}' via any service.")
 962        return None
 963
 964    def resolve_batch(
 965            self,
 966            df: pd.DataFrame,
 967            place_column: str = "place_name",
 968            country_column: Union[str, None] = None,
 969            place_type_column: Union[str, None] = None,
 970            use_default_filter: bool = False,
 971            return_df: bool = True,
 972            show_progress: bool = True
 973    ) -> Union[pd.DataFrame, List[dict]]:
 974        """
 975        Resolve coordinates for a batch of places from a DataFrame.
 976
 977        Args:
 978            df (pd.DataFrame): Input DataFrame with place names and optional country/type columns.
 979            place_column (str): Column name for place names.
 980            country_column (str): Column name for country codes (optional).
 981            place_type_column (str): Column name for place types (optional).
 982            return_df (bool): If True, return a DataFrame with separate columns for each attribute. Otherwise, return a list of dictionaries.
 983            show_progress (bool): If True, show a progress bar during processing.
 984
 985        Raises:
 986            ValueError: If the input DataFrame is not valid or required columns are missing.
 987
 988        Returns:
 989            pd.DataFrame: A DataFrame with resolved coordinates and metadata.
 990            List[dict]: A list of dictionaries with resolved coordinates and metadata if return_df is False.
 991        """
 992        #TODO: 
 993        # - Gently handle NaN and empty strings in place_column
 994        # - Process data in chunks of 100 rows
 995        # - Only process records with valid place names (non-empty strings)
 996        # - Sort Series 
 997
 998        if not isinstance(df, pd.DataFrame):
 999            raise ValueError("Input must be a pandas DataFrame")
1000
1001        if place_column not in df.columns:
1002            raise ValueError(f"Column '{place_column}' not found in DataFrame")
1003
1004        if country_column and country_column not in df.columns:
1005            raise ValueError(f"Column '{country_column}' not found in DataFrame")
1006
1007        if place_type_column and place_type_column not in df.columns:
1008            raise ValueError(f"Column '{place_type_column}' not found in DataFrame")
1009        
1010        if show_progress:
1011            df_iter = tqdm(df.iterrows(), total=len(df))
1012        else:
1013            df_iter = df.iterrows()
1014
1015        results = []
1016        for _, row in df_iter:
1017            place_name = row.get(place_column, "")
1018            country_code = row.get(country_column) if country_column else None
1019            place_type = row.get(place_type_column) if place_type_column else None
1020
1021            coords = self.resolve(
1022                place_name=place_name,
1023                country_code=country_code,
1024                place_type=place_type,
1025                use_default_filter=use_default_filter
1026            )
1027
1028            results.append(coords)
1029
1030        if return_df:
1031            return pd.DataFrame(results, columns=["place", "standardize_label", "language", "latitude", "longitude", "source", "place_id", "place_uri", "country_code", "part_of", "part_of_uri", "confidence", "threshold", "match_type"], index=df.index)
1032        else:
1033            return results

A unified resolver that queries multiple geolocation services in order and returns the first match with valid coordinates.

Args: services (Optional[List[BaseQuery]]): List of geolocation service instances to use. places_map_json (Union[str, None]): Path to a custom places mapping JSON file. lang (Optional[str]): Language code for place type filtering. threshold (float): Fuzzy matching threshold for place name similarity. flexible_threshold (bool): If True, use a lower threshold for shorter place names. flexible_threshold_value (float): The threshold value to use when flexible_threshold is True. If no value is provided, it defaults to 70. verbose (bool): If True, enable verbose logging.

PlaceResolver( services: Optional[List[georesolver.base.BaseQuery]] = None, places_map_json: Optional[str] = None, lang: Optional[str] = None, threshold: float = 90, flexible_threshold: bool = False, flexible_threshold_value: float = 70, verbose: bool = False)
841    def __init__(self, 
842                 services: Optional[List[BaseQuery]] = None, 
843                 places_map_json: Union[str, None] = None, 
844                 lang: Optional[str] = None, 
845                 threshold: float = 90,
846                 flexible_threshold: bool = False,
847                 flexible_threshold_value: float = 70, 
848                 verbose: bool = False):
849
850        self.logger = setup_logger(self.__class__.__name__, verbose)
851        
852        if services is None or not isinstance(services, list) or len(services) == 0:
853            services = [
854                GeoNamesQuery(),
855                WHGQuery(),
856                TGNQuery(),
857                WikidataQuery()
858            ]
859
860        self.services = services
861        self.places_map = self._load_places_map(places_map_json)
862        self.lang = lang if lang else "en"
863
864        if not (0 <= threshold <= 100):
865            raise ValueError("threshold must be between 0 and 100")
866        
867        self.threshold = threshold
868
869        self.flexible_threshold = flexible_threshold
870        if self.flexible_threshold:
871            if not (0 <= flexible_threshold_value <= 100):
872                raise ValueError("flexible_threshold_value must be between 0 and 100")
873            
874            self.flexible_threshold_value = flexible_threshold_value
875        
876        
877
878        for service in self.services:
879            service.logger = setup_logger(service.__class__.__name__, verbose)
880            self.logger.debug(f"Updated logger for {service.__class__.__name__} with verbose={verbose}")
logger
services
places_map
lang
threshold
flexible_threshold
def resolve( self, place_name: str, country_code: Optional[str] = None, place_type: Optional[str] = None, use_default_filter: bool = False) -> Optional[dict]:
896    def resolve(self, 
897                place_name: str, 
898                country_code: Union[str, None] = None, 
899                place_type: Union[str, None] = None,
900               use_default_filter: bool = False) -> Union[dict, None]:
901        """
902        Try resolving the place coordinates using multiple sources.
903
904        Args:
905            place_name (str): The place name to search
906            country_code (str): ISO 3166-1 alpha-2 country code (optional)
907            place_type (str): Place type (optional)
908            use_default_filter (bool): If True, apply a default filter as fallback in case the place_type is not found.
909                                        If no place_type is provided, no filtering will be applied.
910
911        Returns:
912            tuple: (lat, lon) or (None, None) if not found
913        """
914
915        if not place_name or not isinstance(place_name, str):
916            self.logger.error("place_name must be a non-empty string")
917            return None
918
919        place_name = place_name.strip()
920
921        if pycountry.countries.get(alpha_2=country_code) is None and country_code is not None:
922            self.logger.warning(f"Invalid country code: {country_code}\nLook at the correct ISO 3166-1 alpha-2 country codes at https://www.iso.org/iso-3166-country-codes.html")
923            country_code = None
924
925        if self.flexible_threshold and len(place_name) < 5:
926            self.logger.warning(
927                f"Using flexible threshold for short place name: '{place_name}'"
928            )
929            threshold = self.flexible_threshold_value
930        else:
931            threshold = self.threshold
932
933        for service in self.services:
934            try:
935                self.logger.info(f"Trying {service.__class__.__name__} for '{place_name}'")
936                mapper = PlaceTypeMapper(self.places_map)
937                service_key = service.__class__.__name__.lower().replace("query", "")
938
939                resolved_type = None
940
941                if place_type:
942                    resolved_type = mapper.get_for_service(place_type, service_key)
943                    if resolved_type is None and use_default_filter:
944                        self.logger.warning(
945                            f"Unrecognized place_type '{place_type}' for service '{service_key}', falling back to 'pueblo'."
946                        )
947                        resolved_type = mapper.get_for_service("pueblo", service_key)
948                    elif resolved_type is None:
949                        self.logger.debug(
950                            f"Skipping place_type filter for service '{service_key}' (unrecognized type: '{place_type}')."
951                        )
952
953                results = service.places_by_name(place_name, country_code, resolved_type, lang=self.lang)
954                result = service.get_best_match(results, place_name, fuzzy_threshold=threshold, lang=self.lang)
955                if result:
956                    self.logger.info(f"Resolved '{place_name}' via {service.__class__.__name__}: {result}")
957                    return result
958            except Exception as e:
959                traceback_str = traceback.format_exc()
960                self.logger.warning(f"{service.__class__.__name__} failed for '{place_name}': {e}\n{traceback_str}")
961        self.logger.warning(f"Could not resolve '{place_name}' via any service.")
962        return None

Try resolving the place coordinates using multiple sources.

Args: place_name (str): The place name to search country_code (str): ISO 3166-1 alpha-2 country code (optional) place_type (str): Place type (optional) use_default_filter (bool): If True, apply a default filter as fallback in case the place_type is not found. If no place_type is provided, no filtering will be applied.

Returns: dict: the result dictionary from the first service that produces a match, or None if the place name is invalid or no service resolves it

def resolve_batch( self, df: pandas.core.frame.DataFrame, place_column: str = 'place_name', country_column: Optional[str] = None, place_type_column: Optional[str] = None, use_default_filter: bool = False, return_df: bool = True, show_progress: bool = True) -> Union[pandas.core.frame.DataFrame, List[dict]]:
 964    def resolve_batch(
 965            self,
 966            df: pd.DataFrame,
 967            place_column: str = "place_name",
 968            country_column: Union[str, None] = None,
 969            place_type_column: Union[str, None] = None,
 970            use_default_filter: bool = False,
 971            return_df: bool = True,
 972            show_progress: bool = True
 973    ) -> Union[pd.DataFrame, List[dict]]:
 974        """
 975        Resolve coordinates for a batch of places from a DataFrame.
 976
 977        Args:
 978            df (pd.DataFrame): Input DataFrame with place names and optional country/type columns.
 979            place_column (str): Column name for place names.
 980            country_column (str): Column name for country codes (optional).
 981            place_type_column (str): Column name for place types (optional).
 982            return_df (bool): If True, return a DataFrame with separate columns for each attribute. Otherwise, return a list of dictionaries.
 983            show_progress (bool): If True, show a progress bar during processing.
 984
 985        Raises:
 986            ValueError: If the input DataFrame is not valid or required columns are missing.
 987
 988        Returns:
 989            pd.DataFrame: A DataFrame with resolved coordinates and metadata.
 990            List[dict]: A list of dictionaries with resolved coordinates and metadata if return_df is False.
 991        """
 992        #TODO: 
 993        # - Gently handle NaN and empty strings in place_column
 994        # - Process data in chunks of 100 rows
 995        # - Only process records with valid place names (non-empty strings)
 996        # - Sort Series 
 997
 998        if not isinstance(df, pd.DataFrame):
 999            raise ValueError("Input must be a pandas DataFrame")
1000
1001        if place_column not in df.columns:
1002            raise ValueError(f"Column '{place_column}' not found in DataFrame")
1003
1004        if country_column and country_column not in df.columns:
1005            raise ValueError(f"Column '{country_column}' not found in DataFrame")
1006
1007        if place_type_column and place_type_column not in df.columns:
1008            raise ValueError(f"Column '{place_type_column}' not found in DataFrame")
1009        
1010        if show_progress:
1011            df_iter = tqdm(df.iterrows(), total=len(df))
1012        else:
1013            df_iter = df.iterrows()
1014
1015        results = []
1016        for _, row in df_iter:
1017            place_name = row.get(place_column, "")
1018            country_code = row.get(country_column) if country_column else None
1019            place_type = row.get(place_type_column) if place_type_column else None
1020
1021            coords = self.resolve(
1022                place_name=place_name,
1023                country_code=country_code,
1024                place_type=place_type,
1025                use_default_filter=use_default_filter
1026            )
1027
1028            results.append(coords)
1029
1030        if return_df:
1031            return pd.DataFrame(results, columns=["place", "standardize_label", "language", "latitude", "longitude", "source", "place_id", "place_uri", "country_code", "part_of", "part_of_uri", "confidence", "threshold", "match_type"], index=df.index)
1032        else:
1033            return results

Resolve coordinates for a batch of places from a DataFrame.

Args: df (pd.DataFrame): Input DataFrame with place names and optional country/type columns. place_column (str): Column name for place names. country_column (str): Column name for country codes (optional). place_type_column (str): Column name for place types (optional). use_default_filter (bool): Passed through to resolve() for every row. return_df (bool): If True, return a DataFrame with separate columns for each attribute. Otherwise, return a list of dictionaries. show_progress (bool): If True, show a progress bar during processing.

Raises: ValueError: If the input DataFrame is not valid or required columns are missing.

Returns: pd.DataFrame: A DataFrame with resolved coordinates and metadata. List[dict]: A list of dictionaries with resolved coordinates and metadata if return_df is False.

class GeoNamesQuery(georesolver.base.BaseQuery):
 40class GeoNamesQuery(BaseQuery):
 41    """
 42    A class to interact with the GeoNames API.
 43
 44    This class provides methods to search and retrieve geographic coordinates for places
 45    using the GeoNames API. It supports filtering by country and feature class.
 46
 47    Attributes:
 48        endpoint (str): The base URL for the GeoNames API
 49        username (str): GeoNames API username for authentication
 50
 51    Example:
 52        >>> geonames = GeoNamesQuery("http://api.geonames.org", username="your_username")
 53        >>> results = geonames.places_by_name("Madrid", country="ES")
 54        >>> coordinates = geonames.get_best_match(results, "Madrid")
 55    """
 56    def __init__(self, geonames_username: Union[str, None] = None):
 57        super().__init__(base_url=GEONAMES_ENDPOINT)
 58        if geonames_username:
 59            self.username = geonames_username
 60        else:
 61            self.username = os.getenv("GEONAMES_USERNAME")
 62        if not self.username:
 63            raise ValueError("GeoNames username must be provided either as an argument or via the GEONAMES_USERNAME environment variable.")
 64
 65    def places_by_name(self, place_name: str, country_code: Optional[str], place_type: Optional[str] = None, lang: Optional[str] = None) -> dict:
 66        """
 67        Search for places using the GeoNames API.
 68        
 69        Parameters:
 70            place_name (str): Name of the place to search for
 71            country_code (str): Optional ISO 3166-1 alpha-2 country code
 72            place_type (str): Optional feature class (A: country, P: city/village, etc.).
 73                              Additional types can be added in the data/mappings/geonames_place_map.json file.
 74        """
 75
 76        params = {
 77            'q': place_name,
 78            'username': self.username,
 79            'maxRows': 10,
 80            'type': 'json',
 81            'style': 'FULL'
 82        }
 83        
 84        if country_code:
 85            params['country'] = country_code
 86        
 87        if place_type:
 88            params['featureClass'] = place_type.lower()
 89
 90        try:
 91            response = self._limited_get(
 92                "/searchJSON",
 93                params=params
 94            )
 95            return response.json()
 96        except Exception as e:
 97            self.logger.error(f"Error querying GeoNames for '{place_name}': {str(e)}")
 98            return {"geonames": []}
 99        
100    def _post_filtering(
101        self,
102        results: dict,
103        place_name: str,
104        fuzzy_threshold: float,
105        confidence: float,
106        lang: Optional[str] = "en") -> dict:
107        """
108        Returns the dictionary customized to the GeoNames API results.
109        """
110
111        standardize_label = ""
112
113        if lang:
114            self.logger.info(f"Post-filtering GeoNames results for '{place_name}' with language '{lang}'")
115
116            standardize_label = next((name for name in results.get("alternateNames", []) if name["lang"] == lang), {}).get("name", "")
117
118            if not standardize_label:
119                standardize_label = results["toponymName"]
120
121        return {
122                "place": place_name,
123                "standardize_label": standardize_label,
124                "language": lang,
125                "latitude": float(results["lat"]),
126                "longitude": float(results["lng"]),
127                "source": "GeoNames",
128                "id": results["geonameId"],
129                "uri": f"http://sws.geonames.org/{results['geonameId']}/",
130                "country_code": results.get("countryCode", ""),
131                "part_of": "",
132                "part_of_uri": "",
133                "confidence": confidence,
134                "threshold": fuzzy_threshold,
135                "match_type": "exact" if confidence == 100 else "fuzzy"
136            }
137        
138
139    def get_best_match(self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = None) -> Union[dict, None]:
140        """
141        Get the best matching place from the results based on name similarity.
142        
143        Parameters:
144            results (Union[dict, list]): Results from places_by_name query
145            place_name (str): Original place name to match against
146            fuzzy_threshold (float): Minimum similarity score (0-100) for a match
147        
148        Returns:
149            dictionary: A dictionary containing {
150            "place": str, "standardize_label": str, "latitude": float, "longitude": float, "source": "GeoNames", 
151            "id": str, "uri": str, "country_code": str, "confidence": float, "threshold": fuzzy_threshold,
152            "match_type": str
153            }
154        """
155        if not isinstance(results, dict) or not results.get("geonames"):
156            return None
157
158        geonames = results["geonames"]
159        if len(geonames) == 1:
160            result = geonames[0]
161            return self._post_filtering(result, place_name, fuzzy_threshold, 100, lang)
162
163        best_ratio = 0
164        best_coords = None
165        
166        for place in geonames:
167            name = place.get("name", "")
168            alternate_names = place.get("alternateNames", [])
169            all_names = [name] + [n.get("name", "") for n in alternate_names]
170            
171            for n in all_names:
172                partial_ratio = fuzz.partial_ratio(place_name.lower(), n.lower())
173                regular_ratio = fuzz.ratio(place_name.lower(), n.lower())
174                ratio = max(partial_ratio, regular_ratio)
175                
176                if ratio > best_ratio:
177                    best_ratio = ratio
178                    best_coords = self._post_filtering(place, place_name, fuzzy_threshold, ratio, lang)
179                    self.logger.info(f"Found match: '{name}' with similarity {ratio}%")
180
181        if best_ratio >= fuzzy_threshold:
182            return best_coords
183        
184        return None

A class to interact with the GeoNames API.

This class provides methods to search and retrieve geographic coordinates for places using the GeoNames API. It supports filtering by country and feature class.

Attributes: endpoint (str): The base URL for the GeoNames API username (str): GeoNames API username for authentication

Example:

geonames = GeoNamesQuery(geonames_username="your_username") results = geonames.places_by_name("Madrid", country_code="ES") best = geonames.get_best_match(results, "Madrid", fuzzy_threshold=90)

GeoNamesQuery(geonames_username: Optional[str] = None)
56    def __init__(self, geonames_username: Union[str, None] = None):
57        super().__init__(base_url=GEONAMES_ENDPOINT)
58        if geonames_username:
59            self.username = geonames_username
60        else:
61            self.username = os.getenv("GEONAMES_USERNAME")
62        if not self.username:
63            raise ValueError("GeoNames username must be provided either as an argument or via the GEONAMES_USERNAME environment variable.")
def places_by_name( self, place_name: str, country_code: Optional[str], place_type: Optional[str] = None, lang: Optional[str] = None) -> dict:
65    def places_by_name(self, place_name: str, country_code: Optional[str], place_type: Optional[str] = None, lang: Optional[str] = None) -> dict:
66        """
67        Search for places using the GeoNames API.
68        
69        Parameters:
70            place_name (str): Name of the place to search for
71            country_code (str): Optional ISO 3166-1 alpha-2 country code
72            place_type (str): Optional feature class (A: country, P: city/village, etc.).
73                              Additional types can be added in the data/mappings/geonames_place_map.json file.
74        """
75
76        params = {
77            'q': place_name,
78            'username': self.username,
79            'maxRows': 10,
80            'type': 'json',
81            'style': 'FULL'
82        }
83        
84        if country_code:
85            params['country'] = country_code
86        
87        if place_type:
88            params['featureClass'] = place_type.lower()
89
90        try:
91            response = self._limited_get(
92                "/searchJSON",
93                params=params
94            )
95            return response.json()
96        except Exception as e:
97            self.logger.error(f"Error querying GeoNames for '{place_name}': {str(e)}")
98            return {"geonames": []}

Search for places using the GeoNames API.

Parameters: place_name (str): Name of the place to search for country_code (str): Optional ISO 3166-1 alpha-2 country code place_type (str): Optional feature class (A: country, P: city/village, etc.). Additional types can be added in the data/mappings/geonames_place_map.json file.

def get_best_match( self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = None) -> Optional[dict]:
139    def get_best_match(self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = None) -> Union[dict, None]:
140        """
141        Get the best matching place from the results based on name similarity.
142        
143        Parameters:
144            results (Union[dict, list]): Results from places_by_name query
145            place_name (str): Original place name to match against
146            fuzzy_threshold (float): Minimum similarity score (0-100) for a match
147        
148        Returns:
149            dictionary: A dictionary containing {
150            "place": str, "standardize_label": str, "latitude": float, "longitude": float, "source": "GeoNames", 
151            "id": str, "uri": str, "country_code": str, "confidence": float, "threshold": fuzzy_threshold,
152            "match_type": str
153            }
154        """
155        if not isinstance(results, dict) or not results.get("geonames"):
156            return None
157
158        geonames = results["geonames"]
159        if len(geonames) == 1:
160            result = geonames[0]
161            return self._post_filtering(result, place_name, fuzzy_threshold, 100, lang)
162
163        best_ratio = 0
164        best_coords = None
165        
166        for place in geonames:
167            name = place.get("name", "")
168            alternate_names = place.get("alternateNames", [])
169            all_names = [name] + [n.get("name", "") for n in alternate_names]
170            
171            for n in all_names:
172                partial_ratio = fuzz.partial_ratio(place_name.lower(), n.lower())
173                regular_ratio = fuzz.ratio(place_name.lower(), n.lower())
174                ratio = max(partial_ratio, regular_ratio)
175                
176                if ratio > best_ratio:
177                    best_ratio = ratio
178                    best_coords = self._post_filtering(place, place_name, fuzzy_threshold, ratio, lang)
179                    self.logger.info(f"Found match: '{name}' with similarity {ratio}%")
180
181        if best_ratio >= fuzzy_threshold:
182            return best_coords
183        
184        return None

Get the best matching place from the results based on name similarity.

Parameters: results (Union[dict, list]): Results from places_by_name query place_name (str): Original place name to match against fuzzy_threshold (float): Minimum similarity score (0-100) for a match

Returns: dictionary: A dictionary containing { "place": str, "standardize_label": str, "latitude": float, "longitude": float, "source": "GeoNames", "id": str, "uri": str, "country_code": str, "confidence": float, "threshold": fuzzy_threshold, "match_type": str }

class WHGQuery(georesolver.base.BaseQuery):
class WHGQuery(BaseQuery):
    """
    A class to interact with the World Historical Gazetteer (WHG) API.

    This class provides methods to search and retrieve geographic coordinates for historical
    places using the WHG API. It supports filtering by country code and feature class,
    and includes functionality to pick a matching place from multiple results.

    Attributes:
        base_url (str): The base URL for the WHG API (initialised from WHG_ENDPOINT)
        search_domain (str): The API endpoint path for searches. Default is "index"
        dataset (str): The WHG dataset to search in (default: "")

    Example:
        >>> whg = WHGQuery()
        >>> results = whg.places_by_name("Cuicatlán", country_code="MX", place_type="p")
        >>> best = whg.get_best_match(results, "Cuicatlán", fuzzy_threshold=90)
    """
    def __init__(self,
                 search_domain: str = "index",
                 dataset: str = ""):
        """
        Parameters:
            search_domain (str): Path segment appended to the base URL for searches.
            dataset (str): Value sent as the ``dataset`` query parameter.
        """
        super().__init__(base_url=WHG_ENDPOINT)
        self.dataset = dataset
        self.search_domain = search_domain

    @sleep_and_retry
    @limits(calls=5, period=1)  # There's no official rate limit for WHG, but we set a conservative limit
    def places_by_name(self,
                       place_name: str,
                       country_code: Optional[str],
                       place_type: Optional[str] = "p",
                       lang: Optional[str] = None) -> Union[dict, list]:
        """
        Search for place using the World Historical Gazetteer API https://docs.whgazetteer.org/content/400-Technical.html#api

        Parameters:
            place_name (str): Any string with the name of the place. This keyword includes place names variants.
            country_code (str): ISO 3166-1 alpha-2 country code.
            place_type (str): Feature class according to Linked Places Format. Default is 'p' for place. Look at https://github.com/LinkedPasts/linked-places-format for more places classes.
            lang (str): Not used by this method.

        Returns:
            The decoded JSON response (filtered by country when ``country_code`` is given),
            or ``{"features": []}`` on a request error or invalid JSON.
        """
        from urllib.parse import quote, urlencode

        if not place_type:
            self.logger.debug("No place_type provided, defaulting to 'p' for place type.")
            place_type = "p"

        # Percent-encode every value so names containing '&', '#', '?' or spaces
        # cannot corrupt the query string.
        query = {"name": place_name, "fclass": place_type, "dataset": self.dataset}
        if country_code:
            query["ccodes"] = country_code
        url = f"{self.base_url}/{self.search_domain}/?{urlencode(query, quote_via=quote)}"

        try:
            response = self._limited_get(url)
            results = response.json()
            if country_code:
                return self._post_filtering_search(results, country_code=country_code)
            return results
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request error searching for '{place_name}': {str(e)}")
            return {"features": []}
        except ValueError as e:
            self.logger.error(f"Invalid JSON response for '{place_name}': {str(e)}")
            return {"features": []}

    def get_best_match(self,
                       results: Union[dict, list],
                       place_name: str,
                       fuzzy_threshold: float,
                       lang: Optional[str] = None) -> Union[dict, None]:
        """
        Return the first feature whose title is similar enough to ``place_name``.

        Features are examined in response order; the first one whose
        ``fuzz.ratio`` against the title reaches ``fuzzy_threshold`` is passed to
        ``_post_filtering`` and that result is returned.

        Parameters:
            results (Union[dict, list]): Results from places_by_name query
            place_name (str): Original place name to match against
            fuzzy_threshold (float): Minimum similarity score (0-100) for a match
            lang (Optional[str]): Forwarded to ``_post_filtering``

        Returns:
            The dictionary built by ``_post_filtering``, or None when nothing
            matches or any error occurs while processing the results.
        """
        self.logger.info(f"Finding best match for '{place_name}' in WHG results")
        self.logger.debug(f"Results: {results}")

        try:
            features = results.get("features", []) if isinstance(results, dict) else []
            if not features:
                return None

            for r in features:
                name = r.get("properties", {}).get("title", "")
                if not name:
                    continue

                ratio = fuzz.ratio(name.lower(), place_name.lower())
                self.logger.info(f"Comparing '{name}' with '{place_name}': {ratio}% similarity")
                if ratio >= fuzzy_threshold:
                    return self._post_filtering(
                        results=r,
                        place_name=place_name,
                        fuzzy_threshold=fuzzy_threshold,
                        confidence=ratio,
                        lang=lang
                    )

            return None

        except Exception as e:
            self.logger.error(f"Error processing results: {str(e)}")
            return None

    def _post_filtering_search(
            self,
            results: dict,
            country_code: Optional[str] = None) -> dict:
        """
        Post-process the WHG API results to filter by country code. This extra step is necessary
        because the WHG API does a soft filtering by country code, but it does not guarantee that
        all results will match the provided country code.
        """
        if not results.get("features"):
            return {"features": []}

        filtered = []
        for feature in results["features"]:
            # Country codes live under "properties"; fall back to the feature root
            ccodes = feature.get("properties", {}).get("ccodes", [])
            if len(ccodes) == 0:
                ccodes = feature.get("ccodes", [])

            if country_code and country_code.upper() not in ccodes:
                continue

            filtered.append(feature)

        return {"features": filtered}

    def get_coordinates_lod_json(self, geometry: dict, place_name: str) -> Union[list, None]:
        """
        Extracts geographic coordinates from the WHG API response.

        Parameters:
            geometry (dict): The ``geometry`` member of a feature.
            place_name (str): Name used in log messages only.

        Returns:
            For a GeometryCollection, the coordinates of the first Point member that
            has coordinates, or None when there is none. For any other geometry, its
            ``coordinates`` value (an empty list when absent).
        """
        if geometry.get("type") != "GeometryCollection":
            return geometry.get("coordinates", [])

        self.logger.warning(f"Best match for '{place_name}' is a GeometryCollection. Taking the first valid point.")
        for geom in geometry.get("geometries", []):
            coordinates = geom.get("coordinates")
            if geom.get("type") == "Point" and coordinates:
                # Previously the found coordinates were dropped by falling off the end
                return coordinates

        self.logger.warning(f"No valid Point found in GeometryCollection for '{place_name}'.")
        return None

    def _post_filtering(
            self,
            results: dict,
            place_name: str,
            fuzzy_threshold: float,
            confidence: float,
            lang: Optional[str] = "en") -> Union[dict, None]:
        """
        Returns the dictionary customized to the WHG API results.

        Returns None when the feature does not yield exactly two coordinates.
        """
        self.logger.debug(f"Post-filtering WHG results for '{place_name}' with language '{lang}'\n{results}")

        geometry = results.get("geometry", {})
        coordinates = self.get_coordinates_lod_json(geometry, place_name)
        if not coordinates or len(coordinates) != 2:
            return None

        properties = results.get("properties", {})
        name = properties.get("title", "")
        index_id = properties.get("index_id", "")
        ccodes = properties.get("ccodes")
        self.logger.info(f"Best match for '{place_name}': {name} ({confidence}%)")
        return {
            "place": place_name,
            "standardize_label": name,
            "language": lang,
            # Coordinates are read as [longitude, latitude]
            "latitude": float(coordinates[1]),
            "longitude": float(coordinates[0]),
            "source": "WHG",
            "id": index_id,
            "uri": f"https://whgazetteer.org/places/{index_id}/portal/",
            "country_code": ccodes[0] if ccodes else "",
            "part_of": "",
            "part_of_uri": "",
            "confidence": confidence,
            "threshold": fuzzy_threshold,
            "match_type": "exact" if confidence == 100 else "fuzzy"
        }

A class to interact with the World Historical Gazetteer (WHG) API.

This class provides methods to search and retrieve geographic coordinates for historical places using the WHG API. It supports filtering by country code and feature class, and includes functionality to find the best matching place from multiple results.

Attributes: base_url (str): The base URL for the WHG API search_domain (str): The API endpoint path for searches. Default is "index" dataset (str): The WHG dataset to search in (default: "")

Example:

whg = WHGQuery() results = whg.places_by_name("Cuicatlán", country_code="MX", place_type="p") best = whg.get_best_match(results, "Cuicatlán", fuzzy_threshold=90)

WHGQuery(search_domain: str = 'index', dataset: str = '')
204    def __init__(self, 
205                 search_domain: str = "index", 
206                 dataset: str = ""):
207        super().__init__(base_url=WHG_ENDPOINT)
208        self.dataset = dataset
209        self.search_domain = search_domain
dataset
search_domain
@sleep_and_retry
@limits(calls=5, period=1)
def places_by_name( self, place_name: str, country_code: Optional[str], place_type: Optional[str] = 'p', lang: Optional[str] = None) -> Union[dict, list]:
211    @sleep_and_retry
212    @limits(calls=5, period=1)  # There's no official rate limit for WHG, but we set a conservative limit
213    def places_by_name(self, 
214                       place_name: str, 
215                       country_code: Optional[str], 
216                       place_type: Optional[str] = "p",
217                       lang: Optional[str] = None) -> Union[dict, list]:
218        """
219        Search for place using the World Historical Gazetteer API https://docs.whgazetteer.org/content/400-Technical.html#api
220        
221        Parameters:
222            place_name (str): Any string with the name of the place. This keyword includes place names variants.
223            country_code (str): ISO 3166-1 alpha-2 country code.
224            place_type (str): Feature class according to Linked Places Format. Default is 'p' for place. Look at https://github.com/LinkedPasts/linked-places-format for more places classes.
225        """
226        
227        if not place_type:
228            self.logger.debug("No place_type provided, defaulting to 'p' for place type.")
229            place_type = "p"
230
231        # Build URL with optional country code
232        url = f"{self.base_url}/{self.search_domain}/?name={place_name}&fclass={place_type}&dataset={self.dataset}"
233        if country_code:
234            url += f"&ccodes={country_code}"
235
236        try:
237            response = self._limited_get(url)
238            results = response.json()
239            if country_code:
240                return self._post_filtering_search(results, country_code=country_code)
241            return results
242        except requests.exceptions.RequestException as e:
243            self.logger.error(f"Request error searching for '{place_name}': {str(e)}")
244            return {"features": []}
245        except ValueError as e:
246            self.logger.error(f"Invalid JSON response for '{place_name}': {str(e)}")
247            return {"features": []}

Search for place using the World Historical Gazetteer API https://docs.whgazetteer.org/content/400-Technical.html#api

Parameters: place_name (str): Any string with the name of the place. This keyword includes place names variants. country_code (str): ISO 3166-1 alpha-2 country code. place_type (str): Feature class according to Linked Places Format. Default is 'p' for place. Look at https://github.com/LinkedPasts/linked-places-format for more places classes.

def get_best_match( self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = None) -> Optional[dict]:
250    def get_best_match(self, 
251                       results: Union[dict, list], 
252                       place_name: str, 
253                       fuzzy_threshold: float,
254                       lang: Optional[str] = None) -> Union[dict, None]:
255
256        self.logger.info(f"Finding best match for '{place_name}' in WHG results")
257        self.logger.debug(f"Results: {results}")
258
259        try:
260            features = results.get("features", []) if isinstance(results, dict) else []
261            if not features:
262                return None
263
264            for r in features:
265                name = r.get("properties", {}).get("title", "")
266                if not name:
267                    continue
268                
269                ratio = fuzz.ratio(name.lower(), place_name.lower())
270                self.logger.info(f"Comparing '{name}' with '{place_name}': {ratio}% similarity")
271                if ratio >= fuzzy_threshold:
272                    return self._post_filtering(
273                        results=r,
274                        place_name=place_name,
275                        fuzzy_threshold=fuzzy_threshold,
276                        confidence=ratio,
277                        lang=lang
278                    )
279
280            return None
281        
282        except Exception as e:
283            self.logger.error(f"Error processing results: {str(e)}")
284            return None

Get the best matching place from the results based on name similarity.

Parameters: results (Union[dict, list]): Results from places_by_name query place_name (str): Original place name to match against fuzzy_threshold (float): Minimum similarity score (0-100) for a match lang (Optional[str]): Language code for place type

Returns: dictionary: A dictionary containing { "place": place_name, "standardize_label": str, "latitude": float, "longitude": float, "source": str, "id": str, "uri": str, "country_code": str, "confidence": float, "threshold": fuzzy_threshold, "match_type": str }

def get_coordinates_lod_json(self, geometry: dict, place_name: str) -> Optional[list]:
314    def get_coordinates_lod_json(self, geometry: dict, place_name: str) -> Union[list, None]:
315        """
316        Extracts geographic coordinates from the WHG API response.
317        """
318
319        if geometry.get("type") == "GeometryCollection":
320            self.logger.warning(f"Best match for '{place_name}' is a GeometryCollection. Taking the first valid point.")
321
322            coordinates = None
323            for geom in geometry.get("geometries", []):
324                if geom.get("type") == "Point":
325                    coordinates = geom.get("coordinates")
326                    break
327            if not coordinates:
328                self.logger.warning(f"No valid Point found in GeometryCollection for '{place_name}'.")
329                return None
330        else:
331            return geometry.get("coordinates", [])

Extracts geographic coordinates from the WHG API response.

class TGNQuery(georesolver.base.BaseQuery):
class TGNQuery(BaseQuery):
    """
    A class to interact with the Getty Thesaurus of Geographic Names (TGN) SPARQL endpoint.

    This class provides methods to search and retrieve geographic coordinates for places
    using the Getty TGN linked open data service. It supports fuzzy matching of place names
    and filtering by country and place type.

    Attributes:
        sparql (SPARQLWrapper): SPARQL endpoint wrapper instance for TGN queries

    Example:
        >>> tgn = TGNQuery()
        >>> results = tgn.places_by_name("Madrid", "ES", "ciudad", lang="es")
        >>> best = tgn.get_best_match(results, "Madrid", fuzzy_threshold=90)
    """
    def __init__(self):
        super().__init__(base_url=TGN_ENDPOINT)
        self.sparql = SPARQLWrapper(self.base_url)
        self.sparql.setReturnFormat(JSON)

    @sleep_and_retry
    @limits(calls=10, period=1)
    def places_by_name(self, place_name: str, country_code: Optional[str], place_type: Optional[str] = None, lang: Optional[str] = "en") -> Union[dict, list]:
        """
        Search for places using the TGN SPARQL endpoint.

        Parameters:
            place_name (str): Name of the place to search for
            country_code (str): Country code or name. A value that is not a known
                                alpha-2 code is used verbatim as the country name.
            place_type (str): Optional type of place (e.g., 'ciudad', 'pueblo')
            lang (str): Language tag attached to the place type label. Falls back to "en" when empty.

        Returns:
            list: The SPARQL result bindings, or an empty list on error or unexpected format.
        """

        def _escape(text) -> str:
            # Keep interpolated values inside their SPARQL string literal
            return (str(text)
                    .replace("\\", "\\\\")
                    .replace('"', '\\"')
                    .replace("\n", "\\n")
                    .replace("\r", "\\r"))

        country_name = ""

        if country_code:
            country = pycountry.countries.get(alpha_2=country_code)
            if country:
                country_name = country.name
            else:
                country_name = country_code

        # An explicit lang=None would otherwise render the invalid tag "@None"
        lang_tag = lang or "en"
        type_filter = f'?p gvp:placeType [rdfs:label "{_escape(place_type)}"@{lang_tag}].' if place_type else ''

        query = f"""
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX luc: <http://www.ontotext.com/owlim/lucene#>
            PREFIX gvp: <http://vocab.getty.edu/ontology#>
            PREFIX xl: <http://www.w3.org/2008/05/skos-xl#>
            PREFIX tgn: <http://vocab.getty.edu/tgn/>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

            SELECT DISTINCT ?p ?pLab ?context WHERE {{
                ?p skos:inScheme tgn:;
                    luc:term "{_escape(place_name)}";
                    gvp:prefLabelGVP [xl:literalForm ?pLab];
                    gvp:parentString ?context.

                {type_filter}
                
                FILTER(CONTAINS(?context, "{_escape(country_name)}"))
            }}
        """

        self.logger.debug(f"Executing SPARQL {query} for TGN with place name '{place_name}' and country code '{country_code}'")

        try:
            self.sparql.setQuery(query)
            results = self.sparql.query().convert()
            self.logger.debug(f"SPARQL query results for '{place_name}': {results}")

            if isinstance(results, dict) and "results" in results and "bindings" in results["results"]:
                return results["results"]["bindings"]
            else:
                self.logger.error(f"Unexpected SPARQL result format for '{place_name}': {results}")
                return []
        except Exception as e:
            self.logger.error(f"Error querying TGN for '{place_name}': {str(e)}")
            return []

    def get_coordinates_lod_json(self, data: dict) -> dict:
        """
        Extract latitude/longitude from a TGN JSON record.

        Looks through ``identified_by`` for the first entry of type
        ``crm:E47_Spatial_Coordinates`` whose ``value`` parses to a two-element
        sequence, read as ``[longitude, latitude]``.

        Returns:
            dict: ``{"latitude": ..., "longitude": ...}``; both are None when no usable entry exists.
        """
        for item in data.get("identified_by", []):
            if item.get("type") != "crm:E47_Spatial_Coordinates":
                continue
            try:
                # literal_eval only evaluates Python literals; it never executes code
                coords = ast.literal_eval(item.get("value"))
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Missing or malformed value: try the next entry
                continue
            if isinstance(coords, (list, tuple)) and len(coords) == 2:
                lon, lat = coords
                return {"latitude": lat, "longitude": lon}
        return {"latitude": None, "longitude": None}

    def _post_filtering(
        self,
        tgn_uri: str,
        place_name: str,
        fuzzy_threshold: float,
        confidence: float,
        lang: Optional[str] = "en") -> dict:
        """
        Fetch the JSON record behind ``tgn_uri`` and build the result dictionary.

        Returns:
            dict: The result dictionary, or an empty dict when the record cannot
            be fetched or carries no usable coordinates.
        """
        json_url = tgn_uri + ".json"
        try:
            response = self._limited_get(json_url)
            results = response.json()
        except Exception as e:
            self.logger.error(f"Error fetching TGN data for {place_name}: {e}")
            return {}

        coordinates = self.get_coordinates_lod_json(results)
        try:
            latitude = float(coordinates["latitude"])
            longitude = float(coordinates["longitude"])
        except (TypeError, ValueError):
            # Previously float(None) raised here right after the warning
            self.logger.warning(f"No valid coordinates found for {place_name} in TGN results.")
            return {}

        # "part_of" may be missing or an empty list
        parent = (results.get("part_of") or [{}])[0]

        return {
                "place": place_name,
                "standardize_label": results.get("_label", ""),
                "language": lang,
                "latitude": latitude,
                "longitude": longitude,
                "source": "TGN",
                "id": results.get("id", ""),
                "uri": results.get("id", ""),
                "country_code": "",
                "part_of": parent.get("_label", ""),
                "part_of_uri": parent.get("id", ""),
                "confidence": confidence,
                "threshold": fuzzy_threshold,
                "match_type": "exact" if confidence == 100 else "fuzzy"
            }

    def get_best_match(self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = "en") -> Union[dict, None]:
        """
        Return the first binding whose preferred label is similar enough to ``place_name``.

        A single binding is accepted without scoring (confidence 100). Otherwise
        bindings are examined in order and the first whose ``fuzz.ratio`` reaches
        ``fuzzy_threshold`` is passed to ``_post_filtering``.

        Parameters:
            results (Union[dict, list]): Bindings returned by places_by_name
            place_name (str): Original place name to match against
            fuzzy_threshold (float): Minimum similarity score (0-100) for a match
            lang (Optional[str]): Forwarded to ``_post_filtering``

        Returns:
            The ``_post_filtering`` result, or None when there is no suitable binding.
        """
        if not results:
            self.logger.debug(f"No results found for '{place_name}' in TGN.")
            return None

        self.logger.debug(f"Finding best match for '{place_name}' in TGN {results}")

        if len(results) == 1:
            return self._post_filtering(results[0].get("p", {}).get("value", ""),
                                        place_name=place_name,
                                        fuzzy_threshold=fuzzy_threshold,
                                        confidence=100,
                                        lang=lang)

        for r in results:
            label = r.get("pLab", {}).get("value", "")
            uri = r.get("p", {}).get("value", "")
            ratio = fuzz.ratio(label.lower(), place_name.lower())
            self.logger.info(f"Comparing '{label}' with '{place_name} {uri}': {ratio}% similarity")
            if ratio >= fuzzy_threshold:
                self.logger.info(f"Best match for '{place_name}': {label} ({ratio}%)")
                return self._post_filtering(uri,
                                            place_name=place_name,
                                            fuzzy_threshold=fuzzy_threshold,
                                            confidence=ratio,
                                            lang=lang)

        self.logger.debug(f"No suitable match found for '{place_name}' in TGN.")
        return None

A class to interact with the Getty Thesaurus of Geographic Names (TGN) SPARQL endpoint.

This class provides methods to search and retrieve geographic coordinates for places using the Getty TGN linked open data service. It supports fuzzy matching of place names and filtering by country and place type.

Attributes: sparql (SPARQLWrapper): SPARQL endpoint wrapper instance for TGN queries

Example:

tgn = TGNQuery() results = tgn.places_by_name("Madrid", "ES", "ciudad", lang="es") best = tgn.get_best_match(results, "Madrid", fuzzy_threshold=90)

sparql
@sleep_and_retry
@limits(calls=10, period=1)
def places_by_name( self, place_name: str, country_code: Optional[str], place_type: Optional[str] = None, lang: Optional[str] = 'en') -> Union[dict, list]:
389    @sleep_and_retry
390    @limits(calls=10, period=1)
391    def places_by_name(self, place_name: str, country_code: Optional[str], place_type: Optional[str] = None, lang: Optional[str] = "en") -> Union[dict, list]:
392        """
393        Search for places using the TGN SPARQL endpoint.
394        
395        Parameters:
396            place_name (str): Name of the place to search for
397            country_code (str): Country code or name
398            place_type (str): Optional type of place (e.g., 'ciudad', 'pueblo')
399        """
400
401        country_name = ""
402
403        if country_code:
404            country = pycountry.countries.get(alpha_2=country_code)
405            if country:
406                country_name = country.name
407            else:
408                country_name = country_code
409
410        type_filter = f'?p gvp:placeType [rdfs:label "{place_type}"@{lang}].' if place_type else ''
411
412        query = f"""
413            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
414            PREFIX luc: <http://www.ontotext.com/owlim/lucene#>
415            PREFIX gvp: <http://vocab.getty.edu/ontology#>
416            PREFIX xl: <http://www.w3.org/2008/05/skos-xl#>
417            PREFIX tgn: <http://vocab.getty.edu/tgn/>
418            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
419
420            SELECT DISTINCT ?p ?pLab ?context WHERE {{
421                ?p skos:inScheme tgn:;
422                    luc:term "{place_name}";
423                    gvp:prefLabelGVP [xl:literalForm ?pLab];
424                    gvp:parentString ?context.
425
426                {type_filter}
427                
428                FILTER(CONTAINS(?context, "{country_name}"))
429            }}
430        """
431        
432        self.logger.debug(f"Executing SPARQL {query} for TGN with place name '{place_name}' and country code '{country_code}'")
433
434        try:
435            self.sparql.setQuery(query)
436            results = self.sparql.query().convert()
437            self.logger.debug(f"SPARQL query results for '{place_name}': {results}")
438
439            if isinstance(results, dict) and "results" in results and "bindings" in results["results"]:
440                return results["results"]["bindings"]
441            else:
442                self.logger.error(f"Unexpected SPARQL result format for '{place_name}': {results}")
443                return []
444        except Exception as e:
445            self.logger.error(f"Error querying TGN for '{place_name}': {str(e)}")
446            return []

Search for places using the TGN SPARQL endpoint.

Parameters: place_name (str): Name of the place to search for country_code (str): Country code or name place_type (str): Optional type of place (e.g., 'ciudad', 'pueblo')

def get_coordinates_lod_json(self, data: dict) -> dict:
448    def get_coordinates_lod_json(self, data: dict) -> dict:
449        
450        for item in data.get("identified_by", []):
451            if item.get("type") == "crm:E47_Spatial_Coordinates":
452                coords = ast.literal_eval(item.get("value"))
453                if isinstance(coords, list) and len(coords) == 2:
454                    lon, lat = coords
455                    return {"latitude": lat, "longitude": lon}
456        return {"latitude": None, "longitude": None}
def get_best_match( self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = 'en') -> Optional[dict]:
496    def get_best_match(self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = "en") -> Union[dict, None]:
497        if not results:
498            self.logger.debug(f"No results found for '{place_name}' in TGN.")
499            return None
500
501        self.logger.debug(f"Finding best match for '{place_name}' in TGN {results}")
502
503        if len(results) == 1:
504            return self._post_filtering(results[0].get("p", {}).get("value", ""),
505                                        place_name=place_name,
506                                        fuzzy_threshold=fuzzy_threshold,
507                                        confidence=100,
508                                        lang=lang)
509
510        for r in results:
511            label = r.get("pLab", {}).get("value", "")
512            uri = r.get("p", {}).get("value", "")
513            ratio = fuzz.ratio(label.lower(), place_name.lower())
514            self.logger.info(f"Comparing '{label}' with '{place_name} {uri}': {ratio}% similarity")
515            if ratio >= fuzzy_threshold:
516                self.logger.info(f"Best match for '{place_name}': {label} ({ratio}%)")
517                return self._post_filtering(uri,
518                                            place_name=place_name,
519                                            fuzzy_threshold=fuzzy_threshold,
520                                            confidence=ratio,
521                                            lang=lang)
522
523        self.logger.debug(f"No suitable match found for '{place_name}' in TGN.")
524        return None

Get the best matching place from the results based on name similarity.

Parameters: results (Union[dict, list]): Results from places_by_name query place_name (str): Original place name to match against fuzzy_threshold (float): Minimum similarity score (0-100) for a match lang (Optional[str]): Language code for place type

Returns: dictionary: A dictionary containing { "place": place_name, "standardize_label": str, "latitude": float, "longitude": float, "source": str, "id": str, "uri": str, "country_code": str, "confidence": float, "threshold": fuzzy_threshold, "match_type": str }

class WikidataQuery(georesolver.base.BaseQuery):
class WikidataQuery(BaseQuery):
    """
    A class to interact with the Wikidata MediaWiki API for geographic coordinates lookup.

    Candidates are found with ``wbsearchentities``; their claims, and the
    claims/labels of the countries and administrative entities they reference,
    are then loaded in bulk with ``wbgetentities`` so that filtering does not
    need one HTTP request per candidate.

    Wikidata properties read by this class:
        P17  - country
        P31  - instance of (used as the place type)
        P131 - located in the administrative territorial entity
        P297 - ISO 3166-1 alpha-2 code (read on the country entity)
        P625 - coordinate location

    Args:
        search_endpoint: URL of the MediaWiki API, used for both the search
            and the batch entity requests.
        entitydata_endpoint: URL prefix to which ``"<QID>.json"`` is appended
            when a single entity is fetched.
    """

    def __init__(self,
                 search_endpoint=WIKIDATA_ENDPOINT,
                 entitydata_endpoint=ENTITYDATA_ENDPOINT):
        """Store both endpoints and initialise the base query with the search endpoint."""
        super().__init__(base_url=search_endpoint)
        self.search_endpoint = search_endpoint
        self.entitydata_endpoint = entitydata_endpoint

    @sleep_and_retry
    @limits(calls=30, period=1)
    def places_by_name(self,
                       place_name: str,
                       country_code: Optional[str],
                       place_type: Optional[str] = None,
                       lang: Optional[str] = "en") -> Union[dict, list]:
        """
        Search Wikidata for ``place_name`` and return the enriched, filtered hits.

        One ``wbsearchentities`` request (at most 10 items) is followed by
        batch fetches of the hits themselves and of the country (P17) and
        administrative (P131) entities they reference. Hits without
        coordinates, or failing the optional filters, are dropped.

        Parameters:
            place_name (str): Name of the place to search for.
            country_code (Optional[str]): When truthy, keep only hits whose
                country ISO code equals it (case-insensitive).
            place_type (Optional[str]): When truthy, keep only hits that list
                this QID among their P31 values.
            lang (Optional[str]): Language for the search and for the
                administrative entity label. A falsy value falls back to "en".

        Returns:
            list: One dict per surviving hit with the keys ``label``, ``qid``,
            ``coordinates``, ``entity_data``, ``claims``, ``country_qid``,
            ``country_iso``, ``admin_qid`` and ``admin_label``. An empty list
            is returned when the search fails or nothing survives.
        """
        # wbsearchentities needs a language; a None value would leave the
        # parameter unset and the search would silently yield nothing.
        search_lang = lang or "en"

        params = {
            "action": "wbsearchentities",
            "search": place_name,
            "language": search_lang,
            "format": "json",
            "type": "item",
            "limit": 10
        }

        try:
            response = self._limited_get(self.search_endpoint, params=params)
            search_results = response.json().get("search", [])
            self.logger.debug(f"Wikidata search results for '{place_name}': {search_results}")
        except Exception as e:
            self.logger.error(f"Error querying Wikidata for '{place_name}': {e}")
            return []

        if not search_results:
            return []

        qids = [hit.get("id") for hit in search_results if hit.get("id")]
        self.logger.debug(f"Found {len(qids)} QIDs for '{place_name}': {qids}")
        if not qids:
            return []

        # Batch fetch entity data for all QIDs
        entities_data = self._batch_fetch_entities(qids)

        # Collect the country and administrative entity QIDs referenced by the
        # hits so that they can be batch fetched as well.
        country_qids = set()
        admin_qids = set()
        for entity in entities_data.values():
            entity_claims = entity.get("claims", {})
            for prop, bucket in (("P17", country_qids), ("P131", admin_qids)):
                try:
                    bucket.add(entity_claims.get(prop, [])[0]["mainsnak"]["datavalue"]["value"]["id"])
                except (IndexError, KeyError, TypeError):
                    # Property absent, or its first statement carries no entity value.
                    pass

        country_data = self._batch_fetch_entities(list(country_qids)) if country_qids else {}
        admin_data = self._batch_fetch_entities(list(admin_qids)) if admin_qids else {}

        enriched_results = []
        for hit in search_results:
            qid = hit.get("id")
            if qid not in entities_data:
                continue

            entity_data = entities_data[qid]
            claims = entity_data.get("claims", {})

            coords = self._extract_coordinates(claims)
            if not coords or coords == (None, None):
                continue

            # Get country info for this place
            place_country_qid, place_country_iso = self._get_place_country_info(claims, country_data)

            # Get administrative entity info for this place
            admin_qid, admin_label = self._get_place_admin_info(claims, admin_data, search_lang)

            if country_code and not self._match_country_optimized(place_country_iso, country_code):
                continue

            if place_type and not self._match_place_type(claims, place_type):
                continue

            # Store all needed data for post-filtering
            enriched_results.append({
                "label": hit.get("label", ""),
                "qid": qid,
                "coordinates": coords,
                "entity_data": entity_data,
                "claims": claims,
                "country_qid": place_country_qid,
                "country_iso": place_country_iso,
                "admin_qid": admin_qid,
                "admin_label": admin_label
            })

        return enriched_results

    def _batch_fetch_entities(self, qids: List[str]) -> Dict[str, dict]:
        """
        Batch fetch entity data for multiple QIDs using wbgetentities API.
        This significantly reduces the number of HTTP requests compared to individual fetches.

        Parameters:
            qids (List[str]): Entity IDs to fetch.

        Returns:
            Dict[str, dict]: Entity records keyed as returned by the API (or by
            QID for records obtained through the per-entity fallback).
        """
        entities_data = {}

        # Process QIDs in chunks of 50 (Wikidata API limit)
        chunk_size = 50
        for start in range(0, len(qids), chunk_size):
            chunk = qids[start:start + chunk_size]

            params = {
                "action": "wbgetentities",
                "ids": "|".join(chunk),
                "format": "json",
                "props": "labels|claims"  # Only fetch what we need
            }

            try:
                response = self._limited_get(self.search_endpoint, params=params)
                result = response.json()

                if "entities" in result:
                    entities_data.update(result["entities"])
                else:
                    # Typically an API-level error payload; make it visible
                    # instead of silently returning fewer entities.
                    self.logger.warning(f"Wikidata returned no entities for {chunk}: {result}")

            except Exception as e:
                self.logger.warning(f"Failed to batch fetch entities {chunk}: {e}")
                # Fallback to individual fetching for this chunk
                for qid in chunk:
                    entity_data = self._fetch_entity_data(qid)
                    if entity_data:
                        entities_data[qid] = entity_data

        return entities_data

    def _get_place_country_info(self, claims: dict, country_data: Dict[str, dict]) -> tuple:
        """
        Extract country QID and ISO code for a place using pre-fetched country data.

        Parameters:
            claims (dict): Claims of the place entity.
            country_data (Dict[str, dict]): Pre-fetched country entities keyed by QID.

        Returns:
            tuple: ``(country_qid, country_iso_code)``. Both are "" when the
            place has no usable P17 value; the ISO code alone is "" when the
            country is known but its P297 value cannot be read.
        """
        try:
            country_qid = claims.get("P17", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
        except (IndexError, KeyError, TypeError):
            return "", ""

        # A missing or unreadable ISO code must not discard the country QID
        # that was already resolved, so it is looked up separately.
        country_claims = country_data.get(country_qid, {}).get("claims", {})
        try:
            iso_code = country_claims.get("P297", [])[0]["mainsnak"]["datavalue"]["value"]
        except (IndexError, KeyError, TypeError):
            iso_code = ""

        return country_qid, (iso_code.upper() if isinstance(iso_code, str) else "")

    def _get_place_admin_info(self, claims: dict, admin_data: Dict[str, dict], lang: Optional[str]) -> tuple:
        """
        Extract administrative entity QID and label for a place using pre-fetched admin data.

        Parameters:
            claims (dict): Claims of the place entity.
            admin_data (Dict[str, dict]): Pre-fetched administrative entities keyed by QID.
            lang (Optional[str]): Language of the label to return.

        Returns:
            tuple: ``(admin_qid, admin_label)``. Both are "" when the place has
            no usable P131 value; the label alone is "" when ``lang`` is falsy,
            the entity was not fetched, or it has no label in ``lang``.
        """
        try:
            admin_qid = claims.get("P131", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
        except (IndexError, KeyError, TypeError):
            return "", ""

        if not lang or admin_qid not in admin_data:
            return admin_qid, ""

        admin_labels = admin_data[admin_qid].get("labels", {})
        return admin_qid, admin_labels.get(lang, {}).get("value", "")

    def _match_country_optimized(self, place_country_iso: str, target_country_code: str) -> bool:
        """
        Optimized country matching using pre-extracted ISO codes.

        Returns False when either code is empty; otherwise compares the two
        codes case-insensitively.
        """
        if not place_country_iso or not target_country_code:
            return False
        return place_country_iso.upper() == target_country_code.upper()

    def _fetch_entity_data(self, qid: str) -> dict:
        """
        Fetch a single entity from ``<entitydata_endpoint><qid>.json``.

        Returns the entity record, or an empty dict (after logging a warning)
        when the request or the lookup in the response fails.
        """
        try:
            url = f"{self.entitydata_endpoint}{qid}.json"
            response = self._limited_get(url)
            return response.json()["entities"][qid]
        except Exception as e:
            self.logger.warning(f"Failed to fetch entity data for {qid}: {e}")
            return {}

    def get_best_match(self,
                       results: Union[dict, list],
                       place_name: str,
                       fuzzy_threshold: float,
                       lang: Optional[str] = None) -> Union[dict, None]:
        """
        Pick the enriched result whose label is most similar to ``place_name``.

        Each label is scored with the larger of ``fuzz.ratio`` and
        ``fuzz.partial_ratio`` (case-insensitive). The highest-scoring result
        at or above ``fuzzy_threshold`` wins; on ties the earliest one is kept.

        Parameters:
            results (Union[dict, list]): Results from places_by_name.
            place_name (str): Original place name to match against.
            fuzzy_threshold (float): Minimum similarity score for a match.
            lang (Optional[str]): Forwarded to ``_post_filtering``.

        Returns:
            Union[dict, None]: The dictionary built by ``_post_filtering`` for
            the winner, or None when there are no results or none qualifies.
        """
        if not results:
            return None

        wanted = place_name.lower()
        top_score = 0
        top_hit = None

        for candidate in results:
            label = candidate["label"]
            lowered = label.lower()
            score = max(fuzz.ratio(lowered, wanted),
                        fuzz.partial_ratio(lowered, wanted))

            # Skip anything below the threshold or not strictly better than
            # the current leader.
            if score < fuzzy_threshold or score <= top_score:
                continue

            top_score = score
            top_hit = candidate
            self.logger.info(f"Wikidata match: '{label}' → {score}%")

        if not top_hit:
            return None

        return self._post_filtering(
            results=top_hit,
            place_name=place_name,
            fuzzy_threshold=fuzzy_threshold,
            confidence=top_score,
            lang=lang
        )

    def _post_filtering(self,
                       results: dict,
                       place_name: str,
                       fuzzy_threshold: float,
                       confidence: float,
                       lang: Optional[str] = "en") -> dict:
        """
        Returns the dictionary customized to the Wikidata API results.

        Parameters:
            results (dict): One enriched result produced by places_by_name.
            place_name (str): Original place name, echoed as ``place``.
            fuzzy_threshold (float): Threshold used for matching, echoed as ``threshold``.
            confidence (float): Similarity score of the selected result.
            lang (Optional[str]): Echoed as ``language``.

        Returns:
            dict: Keys ``place``, ``standardize_label``, ``language``,
            ``latitude``, ``longitude``, ``source``, ``id``, ``uri``,
            ``country_code``, ``part_of``, ``part_of_uri``, ``confidence``,
            ``threshold`` and ``match_type``.
        """
        qid = results.get("qid", "")
        latitude, longitude = results.get("coordinates", (None, None))

        # Use pre-extracted country and administrative entity information
        admin_qid = results.get("admin_qid", "")

        return {
            "place": place_name,
            "standardize_label": results.get("label", ""),
            "language": lang,
            "latitude": float(latitude) if latitude is not None else None,
            "longitude": float(longitude) if longitude is not None else None,
            "source": "Wikidata",
            "id": qid,
            "uri": f"https://www.wikidata.org/entity/{qid}",
            "country_code": results.get("country_iso", ""),
            "part_of": results.get("admin_label", ""),
            # Only build the URI when an administrative entity is known.
            "part_of_uri": f"https://www.wikidata.org/entity/{admin_qid}" if admin_qid else "",
            "confidence": confidence,
            "threshold": fuzzy_threshold,
            # NOTE(review): get_best_match may pass a partial_ratio score, so
            # "exact" can be reported for a substring match - confirm intended.
            "match_type": "exact" if confidence == 100 else "fuzzy"
        }

    def _extract_coordinates(self, claims: dict) -> tuple:
        """
        Read latitude and longitude from the first P625 statement.

        Returns ``(latitude, longitude)``, or ``(None, None)`` when the
        statement is absent or does not have the expected structure.
        """
        try:
            coord_data = claims.get("P625", [])[0]["mainsnak"]["datavalue"]["value"]
            return coord_data["latitude"], coord_data["longitude"]
        except (AttributeError, IndexError, KeyError, TypeError):
            return (None, None)

    def _match_country(self, claims: dict, iso_code: str) -> bool:
        """
        Check a place's country against ``iso_code`` with an individual HTTP request.

        DEPRECATED: Use _match_country_optimized instead. This method is kept
        for backward compatibility but should not be used in the optimized
        workflow as it makes individual HTTP requests.

        Returns True when the P297 value of the place's P17 country equals
        ``iso_code`` (case-insensitive); False otherwise, including on any error.
        """
        try:
            country_entity = claims.get("P17", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
            url = f"{self.entitydata_endpoint}{country_entity}.json"
            response = self._limited_get(url)
            country_data = response.json()
            wikidata_iso = country_data["entities"][country_entity]["claims"]["P297"][0]["mainsnak"]["datavalue"]["value"]
            return wikidata_iso.upper() == iso_code.upper()
        except Exception:
            # Best-effort check: network and structure errors both mean "no match".
            return False

    def _match_place_type(self, claims: dict, expected_qid: str) -> bool:
        """
        Tell whether ``expected_qid`` is among the place's P31 values.

        Statements are inspected one by one, so a statement without an entity
        value is skipped instead of invalidating the whole check.

        Returns:
            bool: True when any P31 statement points to ``expected_qid``.
        """
        for statement in (claims or {}).get("P31") or []:
            try:
                if statement["mainsnak"]["datavalue"]["value"]["id"] == expected_qid:
                    return True
            except (KeyError, TypeError):
                continue
        return False

A class to interact with the Wikidata MediaWiki API for geographic coordinates lookup.

WikidataQuery( search_endpoint='https://www.wikidata.org/w/api.php', entitydata_endpoint='https://www.wikidata.org/wiki/Special:EntityData/')
532    def __init__(self,
533                 search_endpoint=WIKIDATA_ENDPOINT,
534                 entitydata_endpoint=ENTITYDATA_ENDPOINT):
535        super().__init__(base_url=search_endpoint)
536        self.search_endpoint = search_endpoint
537        self.entitydata_endpoint = entitydata_endpoint
search_endpoint
entitydata_endpoint
@sleep_and_retry
@limits(calls=30, period=1)
def places_by_name( self, place_name: str, country_code: Optional[str], place_type: Optional[str] = None, lang: Optional[str] = 'en') -> Union[dict, list]:
539    @sleep_and_retry
540    @limits(calls=30, period=1)
541    def places_by_name(self, 
542                       place_name: str, 
543                       country_code: Optional[str], 
544                       place_type: Optional[str] = None,
545                       lang: Optional[str] = "en") -> Union[dict, list]:
546        
547        params = {
548            "action": "wbsearchentities",
549            "search": place_name,
550            "language": lang,
551            "format": "json",
552            "type": "item",
553            "limit": 10
554        }
555
556        try:
557            response = self._limited_get(self.search_endpoint, params=params)
558            search_results = response.json().get("search", [])
559            self.logger.debug(f"Wikidata search results for '{place_name}': {search_results}")
560        except Exception as e:
561            self.logger.error(f"Error querying Wikidata for '{place_name}': {e}")
562            return []
563
564        if not search_results:
565            return []
566
567        qids = [result.get("id") for result in search_results if result.get("id")]
568        self.logger.debug(f"Found {len(qids)} QIDs for '{place_name}': {qids}")
569        if not qids:
570            return []
571
572        # Batch fetch entity data for all QIDs
573        entities_data = self._batch_fetch_entities(qids)
574        
575        # Extract all country QIDs and administrative entity QIDs to batch fetch them too
576        country_qids = set()
577        admin_qids = set()
578        for qid in entities_data:
579            claims = entities_data[qid].get("claims", {})
580            try:
581                country_qid = claims.get("P17", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
582                country_qids.add(country_qid)
583            except (IndexError, KeyError):
584                pass
585            try:
586                admin_qid = claims.get("P131", [])[0]["mainsnak"]["datavalue"]["value"]["id"]
587                admin_qids.add(admin_qid)
588            except (IndexError, KeyError):
589                pass
590        
591        # Batch fetch country and administrative entity data
592        country_data = {}
593        admin_data = {}
594        if country_qids:
595            country_data = self._batch_fetch_entities(list(country_qids))
596        if admin_qids:
597            admin_data = self._batch_fetch_entities(list(admin_qids))
598           
599        enriched_results = []
600        for result in search_results:
601            qid = result.get("id")
602            label = result.get("label", "")
603            
604            if qid not in entities_data:
605                continue
606                
607            entity_data = entities_data[qid]
608            claims = entity_data.get("claims", {})
609            
610            coords = self._extract_coordinates(claims)
611            if not coords or coords == (None, None):
612                continue
613
614            # Get country info for this place
615            place_country_qid, place_country_iso = self._get_place_country_info(claims, country_data)
616
617            # Get administrative entity info for this place
618            admin_qid, admin_label = self._get_place_admin_info(claims, admin_data, lang)
619
620            if country_code and not self._match_country_optimized(place_country_iso, country_code):
621                continue
622
623            if place_type and not self._match_place_type(claims, place_type):
624                continue
625
626            # Store all needed data for post-filtering
627            enriched_results.append({
628                "label": label,
629                "qid": qid,
630                "coordinates": coords,
631                "entity_data": entity_data,
632                "claims": claims,
633                "country_qid": place_country_qid,
634                "country_iso": place_country_iso,
635                "admin_qid": admin_qid,
636                "admin_label": admin_label
637            })
638
639        return enriched_results

Search for places by name. Must be implemented by subclasses.

Parameters: place_name (str): Name of the place to search for country_code (Optional[str]): ISO 3166-1 alpha-2 country code place_type (Optional[str]): Optional place type filter lang (Optional[str]): Language code for place type

Returns: Union[dict, list]: Search results in service-specific format

def get_best_match( self, results: Union[dict, list], place_name: str, fuzzy_threshold: float, lang: Optional[str] = None) -> Optional[dict]:
724    def get_best_match(self, 
725                       results: Union[dict, list], 
726                       place_name: str, 
727                       fuzzy_threshold: float,
728                       lang: Optional[str] = None) -> Union[dict, None]:
729        if not results:
730            return None
731
732        best_score = 0
733        best_result = None
734
735        for result in results:
736            label = result["label"]
737            score = max(fuzz.ratio(label.lower(), place_name.lower()),
738                        fuzz.partial_ratio(label.lower(), place_name.lower()))
739
740            if score > best_score and score >= fuzzy_threshold:
741                best_score = score
742                best_result = result
743                self.logger.info(f"Wikidata match: '{label}' → {score}%")
744
745        if best_result:
746            return self._post_filtering(
747                results=best_result,
748                place_name=place_name,
749                fuzzy_threshold=fuzzy_threshold,
750                confidence=best_score,
751                lang=lang
752            )
753        
754        return None

Get the best matching place from the results based on name similarity.

Parameters: results (Union[dict, list]): Results from places_by_name query place_name (str): Original place name to match against fuzzy_threshold (float): Minimum similarity score (0-100) for a match lang (Optional[str]): Language code for place type

Returns: dictionary: A dictionary containing { "place": place_name, "standardize_label": str, "latitude": float, "longitude": float, "source": str, "id": str, "uri": str, "country_code": str, "confidence": float, "threshold": fuzzy_threshold, "match_type": str }