"""Source code for gplay_scraper.core.gplay_scraper."""

import json
import re
import logging
from typing import Dict
from ..utils.http_client import HttpClient
from ..config import Config
from ..exceptions import DataParsingError, InvalidAppIdError
from urllib.parse import quote

logger = logging.getLogger(__name__)


class AppScraper:
    """Scraper for fetching app details from Google Play Store."""

    def __init__(self, rate_limit_delay: float = None, http_client: str = None):
        """Initialize AppScraper with HTTP client.

        Args:
            rate_limit_delay: Delay between requests
            http_client: HTTP client name
        """
        self.http_client = HttpClient(rate_limit_delay, http_client)

    def fetch_playstore_page(self, app_id: str, lang: str = Config.DEFAULT_LANGUAGE,
                             country: str = Config.DEFAULT_COUNTRY) -> str:
        """Fetch app page HTML from Google Play Store.

        Args:
            app_id: Google Play app ID
            lang: Language code
            country: Country code

        Returns:
            HTML content of app page
        """
        return self.http_client.fetch_app_page(app_id, lang, country)

    def scrape_play_store_data(self, app_id: str, lang: str = Config.DEFAULT_LANGUAGE,
                               country: str = Config.DEFAULT_COUNTRY) -> Dict:
        """Extract dataset from app page HTML.

        Args:
            app_id: Google Play app ID
            lang: Language code
            country: Country code

        Returns:
            Dictionary containing ds:5 dataset

        Raises:
            DataParsingError: If dataset not found
        """
        page_html = self.fetch_playstore_page(app_id, lang, country)

        # Fast path: grab the AF_initDataCallback payload keyed 'ds:5' directly.
        direct = re.search(
            r'AF_initDataCallback\s*\(\s*({\s*key:\s*["\']ds:5["\'][\s\S]*?})\s*\)\s*;',
            page_html,
            re.DOTALL,
        )
        if direct:
            return {"ds:5": direct.group(1)}

        # Fallback: scan every callback payload for a ds:5 key reference.
        callbacks = re.findall(
            r'AF_initDataCallback\s*\(\s*({[\s\S]*?})\s*\)\s*;',
            page_html,
            re.DOTALL,
        )
        payload = next((cb for cb in callbacks if "'ds:5'" in cb or '"ds:5"' in cb), "")
        if not payload:
            raise DataParsingError(Config.ERROR_MESSAGES["DS5_NOT_FOUND"])
        return {"ds:5": payload}
class SearchScraper:
    """Scraper for fetching search results from Google Play Store."""

    def __init__(self, rate_limit_delay: float = None, http_client: str = None):
        """Initialize SearchScraper with HTTP client.

        Args:
            rate_limit_delay: Delay between requests
            http_client: HTTP client name
        """
        self.http_client = HttpClient(rate_limit_delay, http_client)

    def scrape_play_store_data(self, query: str, count: int = Config.DEFAULT_SEARCH_COUNT,
                               lang: str = Config.DEFAULT_LANGUAGE,
                               country: str = Config.DEFAULT_COUNTRY) -> Dict:
        """Scrape search results with automatic pagination support.

        Args:
            query: Search query string
            count: Total number of results to fetch
            lang: Language code
            country: Country code

        Returns:
            Dictionary containing all search results

        Raises:
            DataParsingError: If parsing fails
        """
        # Get initial search page
        html_content = self.fetch_playstore_search(query, count, lang, country)

        # Use parser for all parsing operations (local import avoids a cycle).
        from .gplay_parser import SearchParser
        parser = SearchParser()
        dataset = parser.parse_html_content(html_content)

        # The first page already holds up to 20 results; no pagination needed.
        if count <= 20:
            return dataset

        # Extract pagination token
        token = parser.extract_pagination_token(dataset)

        # Collect all results, starting with the ones on the initial page.
        all_results = []
        initial_results = self._get_nested_value(dataset.get("ds:1", []), [0, 1, 0, 0, 0], [])
        all_results.extend(initial_results)

        # Paginate while more results are wanted and a continuation token exists.
        while len(all_results) < count and token:
            needed = min(100, count - len(all_results))  # API caps batches at 100
            try:
                response_text = self.http_client.fetch_search_page(
                    token=token, needed=needed, lang=lang, country=country
                )
                # Responses carry a 5-character anti-JSON-hijacking prefix; skip it.
                data = json.loads(response_text[5:])
                parsed_data = json.loads(data[0][2])
                if not parsed_data:
                    break
                paginated_results = self._get_nested_value(parsed_data, [0, 0, 0], [])
                all_results.extend(paginated_results)
                # Update token for the next iteration; a missing token ends the loop.
                token = self._get_nested_value(parsed_data, [0, 0, 7, 1])
            except Exception:
                # Pagination is best-effort: keep the results gathered so far,
                # but log the failure instead of swallowing it silently.
                # (The original `except (..., Exception)` made the specific
                # exception types dead code and hid every error without trace.)
                logger.warning(
                    "Search pagination failed for query %r; returning partial results",
                    query, exc_info=True,
                )
                break

        # Write the aggregated (and truncated) result list back into the dataset.
        if "ds:1" in dataset:
            dataset["ds:1"][0][1][0][0][0] = all_results[:count]

        return dataset

    def _get_nested_value(self, data, path, default=None):
        """Safely get nested value from data structure.

        Args:
            data: Data structure to traverse
            path: List of keys/indices to follow
            default: Default value if path not found

        Returns:
            Value at path or default
        """
        try:
            for key in path:
                data = data[key]
            return data
        except (KeyError, IndexError, TypeError):
            return default
class ReviewsScraper:
    """Scraper for fetching user reviews from Google Play Store."""

    # Sort order mapping
    SORT_NAMES = {
        'RELEVANT': 1,  # Most relevant reviews
        'NEWEST': 2,    # Newest reviews first
        'RATING': 3     # Sorted by rating
    }

    # Strips the anti-JSON-hijacking prefix ")]}'" and captures the JSON body.
    # Compiled once at class level instead of on every pagination iteration.
    _RESPONSE_REGEX = re.compile(r"\)]}'\n\n([\s\S]+)")

    def __init__(self, rate_limit_delay: float = None, http_client: str = None):
        """Initialize ReviewsScraper with HTTP client.

        Args:
            rate_limit_delay: Delay between requests
            http_client: HTTP client name
        """
        self.http_client = HttpClient(rate_limit_delay, http_client)

    def fetch_reviews_batch(self, app_id: str, lang: str = Config.DEFAULT_LANGUAGE,
                            country: str = Config.DEFAULT_COUNTRY,
                            sort: int = Config.DEFAULT_REVIEWS_SORT,
                            batch_count: int = Config.DEFAULT_REVIEWS_BATCH_SIZE,
                            token: str = None) -> str:
        """Fetch single batch of reviews from API.

        Args:
            app_id: Google Play app ID
            lang: Language code
            country: Country code
            sort: Sort order (NEWEST, RELEVANT, RATING)
            batch_count: Number of reviews per batch
            token: Pagination token for next batch

        Returns:
            Raw API response content
        """
        # Accept either a symbolic name ('NEWEST') or a raw integer sort value.
        sort_value = self.SORT_NAMES.get(sort, sort) if isinstance(sort, str) else sort
        return self.http_client.fetch_reviews_batch(app_id, lang, country, sort_value, batch_count, token)

    def scrape_reviews_data(self, app_id: str, count: int = Config.DEFAULT_REVIEWS_COUNT,
                            lang: str = Config.DEFAULT_LANGUAGE,
                            country: str = Config.DEFAULT_COUNTRY,
                            sort: int = Config.DEFAULT_REVIEWS_SORT) -> Dict:
        """Scrape multiple batches of reviews.

        Args:
            app_id: Google Play app ID
            count: Total number of reviews to fetch
            lang: Language code
            country: Country code
            sort: Sort order

        Returns:
            Dictionary containing all review responses
        """
        all_responses = []
        token = None
        batch_size = Config.DEFAULT_REVIEWS_BATCH_SIZE

        # Each response is assumed to hold up to batch_size reviews.
        while len(all_responses) * batch_size < count:
            remaining = count - (len(all_responses) * batch_size)
            fetch_count = min(batch_size, remaining)

            response = self.fetch_reviews_batch(app_id, lang, country, sort, fetch_count, token)
            if not response:
                break
            all_responses.append(response)

            try:
                matches = self._RESPONSE_REGEX.findall(response)
                if matches:
                    data = json.loads(matches[0])
                    token = json.loads(data[0][2])[-2][-1]
                # A missing or malformed token means there are no further pages.
                if not token or isinstance(token, list):
                    break
            except (json.JSONDecodeError, IndexError, KeyError, TypeError):
                # TypeError covers json.loads(None) / non-subscriptable payload
                # slots, which previously crashed the scrape mid-pagination.
                break

        return {"reviews": all_responses}
class DeveloperScraper:
    """Scraper for fetching developer portfolio from Google Play Store."""

    def __init__(self, rate_limit_delay: float = None, http_client: str = None):
        """Initialize DeveloperScraper with HTTP client.

        Args:
            rate_limit_delay: Delay between requests
            http_client: HTTP client name
        """
        self.http_client = HttpClient(rate_limit_delay, http_client)

    def fetch_developer_page(self, dev_id: str, lang: str = Config.DEFAULT_LANGUAGE,
                             country: str = Config.DEFAULT_COUNTRY) -> str:
        """Fetch developer page HTML from Google Play Store.

        Args:
            dev_id: Developer ID (numeric or string)
            lang: Language code
            country: Country code

        Returns:
            HTML content of developer page
        """
        return self.http_client.fetch_developer_page(dev_id, lang, country)

    def scrape_play_store_data(self, dev_id: str, lang: str = Config.DEFAULT_LANGUAGE,
                               country: str = Config.DEFAULT_COUNTRY) -> Dict:
        """Extract dataset from developer page HTML.

        Args:
            dev_id: Developer ID
            lang: Language code
            country: Country code

        Returns:
            Dictionary containing ds:3 dataset and dev_id

        Raises:
            DataParsingError: If dataset not found
        """
        page_html = self.fetch_developer_page(dev_id, lang, country)

        # Prefer the AF_initDataCallback payload explicitly keyed 'ds:3'.
        keyed = re.search(
            r'AF_initDataCallback\s*\(\s*({\s*key:\s*["\']ds:3["\'][\s\S]*?})\s*\)\s*;',
            page_html,
            re.DOTALL,
        )
        if keyed:
            return {"ds:3": keyed.group(1), "dev_id": dev_id}

        # Otherwise inspect every callback payload for a ds:3 key mention.
        ds3_payload = ""
        for candidate in re.findall(
            r'AF_initDataCallback\s*\(\s*({[\s\S]*?})\s*\)\s*;', page_html, re.DOTALL
        ):
            if "'ds:3'" in candidate or '"ds:3"' in candidate:
                ds3_payload = candidate
                break
        if not ds3_payload:
            raise DataParsingError(Config.ERROR_MESSAGES["DS3_NOT_FOUND"])
        return {"ds:3": ds3_payload, "dev_id": dev_id}
class SimilarScraper:
    """Scraper for fetching similar apps from Google Play Store."""

    def __init__(self, rate_limit_delay: float = None, http_client: str = None):
        """Initialize SimilarScraper with HTTP client.

        Args:
            rate_limit_delay: Delay between requests
            http_client: HTTP client name
        """
        self.http_client = HttpClient(rate_limit_delay, http_client)

    def fetch_similar_page(self, app_id: str, lang: str = Config.DEFAULT_LANGUAGE,
                           country: str = Config.DEFAULT_COUNTRY) -> str:
        """Fetch app page HTML to extract similar apps cluster URL.

        Args:
            app_id: Google Play app ID
            lang: Language code
            country: Country code

        Returns:
            HTML content of app page
        """
        return self.http_client.fetch_app_page(app_id, lang, country)

    def scrape_play_store_data(self, app_id: str, lang: str = Config.DEFAULT_LANGUAGE,
                               country: str = Config.DEFAULT_COUNTRY) -> Dict:
        """Extract similar apps dataset from cluster page.

        Args:
            app_id: Google Play app ID
            lang: Language code
            country: Country code

        Returns:
            Dictionary containing ds:3 dataset (value is None when no
            similar-apps cluster URL is present on the app page)

        Raises:
            DataParsingError: If dataset not found
        """
        html_content = self.fetch_similar_page(app_id, lang, country)

        # Extract cluster URL from the app page: it may appear HTML-escaped
        # (&quot;...&quot;) or as a plain quoted string.
        pattern1 = r'&quot;(/store/apps/collection/cluster\?gsr=[^&]+)&quot;'
        matches1 = re.findall(pattern1, html_content)
        pattern2 = r'"(/store/apps/collection/cluster\?gsr=[^"]+)"'
        matches2 = re.findall(pattern2, html_content)

        # Deduplicate while preserving document order. The previous
        # list(set(...)) iterated in nondeterministic order, so the cluster
        # URL chosen below could vary between runs for the same page.
        all_matches = list(dict.fromkeys(matches1 + matches2))
        if not all_matches:
            return {"ds:3": None}

        cluster_url = all_matches[0].replace('&amp;', '&')
        cluster_html = self.http_client.fetch_cluster_page(cluster_url, lang, country)

        # Prefer the AF_initDataCallback payload explicitly keyed 'ds:3'.
        ds_match = re.search(
            r'AF_initDataCallback\s*\(\s*({\s*key:\s*["\']ds:3["\'][\s\S]*?})\s*\)\s*;',
            cluster_html,
            re.DOTALL,
        )
        if ds_match:
            ds3_data = ds_match.group(1)
        else:
            # Fallback: scan every callback payload for a ds:3 key mention.
            all_callbacks = re.findall(
                r'AF_initDataCallback\s*\(\s*({[\s\S]*?})\s*\)\s*;', cluster_html, re.DOTALL
            )
            ds3_data = ""
            for callback in all_callbacks:
                if "'ds:3'" in callback or '"ds:3"' in callback:
                    ds3_data = callback
                    break
            if not ds3_data:
                raise DataParsingError(Config.ERROR_MESSAGES["DS3_NOT_FOUND"])

        return {"ds:3": ds3_data}
class ListScraper:
    """Scraper for fetching top charts from Google Play Store."""

    # Collection name mapping
    CLUSTER_NAMES = {
        'TOP_FREE': 'topselling_free',   # Top free apps
        'TOP_PAID': 'topselling_paid',   # Top paid apps
        'TOP_GROSSING': 'topgrossing'    # Top grossing apps
    }

    def __init__(self, rate_limit_delay: float = None, http_client: str = None):
        """Initialize ListScraper with HTTP client.

        Args:
            rate_limit_delay: Delay between requests
            http_client: HTTP client name
        """
        self.http_client = HttpClient(rate_limit_delay, http_client)

    def scrape_play_store_data(self, collection: str, category: str = Config.DEFAULT_LIST_CATEGORY,
                               count: int = Config.DEFAULT_LIST_COUNT,
                               lang: str = Config.DEFAULT_LANGUAGE,
                               country: str = Config.DEFAULT_COUNTRY) -> Dict:
        """Scrape top charts data from Google Play Store.

        Args:
            collection: Collection type (TOP_FREE, TOP_PAID, TOP_GROSSING)
            category: App category (e.g., GAME, SOCIAL)
            count: Number of apps to fetch
            lang: Language code
            country: Country code

        Returns:
            Dictionary containing collection data

        Raises:
            DataParsingError: If JSON parsing fails
        """
        # Accept a symbolic name (TOP_FREE, ...) or a raw cluster identifier.
        cluster = self.CLUSTER_NAMES.get(collection, collection)
        response_text = self.http_client.fetch_list_page(cluster, category, count, lang, country)
        try:
            # The JSON envelope of interest is on the third line of the response.
            payload_line = response_text.strip().split('\n')[2]
            envelope = json.loads(payload_line)
            collection_data = json.loads(envelope[0][2])
        except (json.JSONDecodeError, IndexError, KeyError) as e:
            raise DataParsingError(Config.ERROR_MESSAGES["JSON_PARSE_FAILED"].format(error=str(e)))
        return {"collection_data": collection_data}
class SuggestScraper:
    """Scraper for fetching search suggestions from Google Play Store."""

    def __init__(self, rate_limit_delay: float = None, http_client: str = None):
        """Initialize SuggestScraper with HTTP client.

        Args:
            rate_limit_delay: Delay between requests
            http_client: HTTP client name
        """
        self.http_client = HttpClient(rate_limit_delay, http_client)

    def scrape_suggestions(self, term: str, lang: str = Config.DEFAULT_LANGUAGE,
                           country: str = Config.DEFAULT_COUNTRY) -> Dict:
        """Scrape search suggestions from Google Play Store.

        Args:
            term: Search term for suggestions
            lang: Language code
            country: Country code

        Returns:
            Dictionary containing list of suggestions

        Raises:
            DataParsingError: If JSON parsing fails
        """
        # An empty term cannot have suggestions; skip the network round-trip.
        if not term:
            return {"suggestions": []}

        response_text = self.http_client.fetch_suggest_page(term, lang, country)
        try:
            # Drop the 5-character anti-JSON-hijacking prefix before parsing.
            outer = json.loads(response_text[5:])
            inner = json.loads(outer[0][2])
            if inner is None:
                return {"suggestions": []}
            return {"suggestions": [entry[0] for entry in inner[0][0]]}
        except (json.JSONDecodeError, IndexError, KeyError, TypeError) as e:
            raise DataParsingError(Config.ERROR_MESSAGES["JSON_PARSE_FAILED"].format(error=str(e)))