Source code for gplay_scraper.core.gplay_parser
"""Parser classes for extracting and formatting data from raw responses.
This module contains 7 parser classes that handle JSON/HTML parsing and
data formatting for all scraping methods.
"""
import json
import re
from datetime import datetime, timezone
from typing import Dict, Any, List, Optional, Tuple
from ..models.element_specs import ElementSpecs, nested_lookup
from ..utils.helpers import clean_json_string, alternative_json_clean, calculate_app_age, calculate_daily_installs, calculate_monthly_installs
from ..config import Config
from ..exceptions import DataParsingError
class AppParser:
"""Parser for extracting and formatting app data."""
def parse_app_data(self, dataset: Dict, app_id: str) -> Dict[str, Any]:
"""Parse raw app data from dataset.
Args:
dataset: Raw dataset from scraper
app_id: Google Play app ID
Returns:
Dictionary with parsed app details
Raises:
DataParsingError: If parsing fails
"""
ds5_data = dataset.get("ds:5", "")
if not ds5_data:
raise DataParsingError(Config.ERROR_MESSAGES["NO_DS5_DATA"])
json_str_cleaned = clean_json_string(ds5_data)
try:
data = json.loads(json_str_cleaned)
except json.JSONDecodeError as e:
try:
alternative_cleaned = alternative_json_clean(ds5_data)
data = json.loads(alternative_cleaned)
except Exception:
raise DataParsingError(Config.ERROR_MESSAGES["JSON_PARSE_FAILED"].format(error=str(e)))
app_details = {}
for key, spec in ElementSpecs.App.items():
app_details[key] = spec.extract_content(data.get("data", data))
app_details['appId'] = app_id
app_details['url'] = f"{Config.PLAY_STORE_BASE_URL}{Config.APP_DETAILS_ENDPOINT}?id={app_id}"
current_date = datetime.now(timezone.utc)
release_date_str = app_details.get("released")
if release_date_str:
app_details["appAge"] = calculate_app_age(release_date_str, current_date)
app_details["dailyInstalls"] = calculate_daily_installs(app_details.get("installs"), release_date_str, current_date)
app_details["minDailyInstalls"] = calculate_daily_installs(app_details.get("minInstalls"), release_date_str, current_date)
app_details["realDailyInstalls"] = calculate_daily_installs(app_details.get("realInstalls"), release_date_str, current_date)
app_details["monthlyInstalls"] = calculate_monthly_installs(app_details.get("installs"), release_date_str, current_date)
app_details["minMonthlyInstalls"] = calculate_monthly_installs(app_details.get("minInstalls"), release_date_str, current_date)
app_details["realMonthlyInstalls"] = calculate_monthly_installs(app_details.get("realInstalls"), release_date_str, current_date)
else:
metric_keys = [
"appAge", "dailyInstalls", "minDailyInstalls", "realDailyInstalls",
"monthlyInstalls", "minMonthlyInstalls", "realMonthlyInstalls"
]
for key in metric_keys:
app_details[key] = None
return app_details
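# Derived-metric note: appAge and the daily/monthly install figures above all
# hinge on the "released" field, so every one of those seven keys is set to
# None when the release date is missing. Illustrative expectation (dataset
# contents assumed, not taken from a real response):
#
#     details = AppParser().parse_app_data(dataset_without_release_date, "com.example.app")
#     details["dailyInstalls"] is None  # -> True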
def format_app_data(self, details: dict) -> dict:
"""Format parsed app data into final structure.
Args:
details: Parsed app details
Returns:
Formatted dictionary with all app fields
"""
return {
"appId": details.get("appId"),
"title": details.get("title"),
"summary": details.get("summary"),
"description": details.get("description"),
"genre": details.get("genre"),
"genreId": details.get("genreId"),
"categories": details.get("categories"),
"available": details.get("available"),
"released": details.get("released"),
"appAgeDays": details.get("appAge"),
"lastUpdated": details.get("lastUpdatedOn"),
"updatedTimestamp": details.get("updated"),
"icon": details.get("icon"),
"headerImage": details.get("headerImage"),
"screenshots": details.get("screenshots"),
"video": details.get("video"),
"videoImage": details.get("videoImage"),
"installs": details.get("installs"),
"minInstalls": details.get("minInstalls"),
"realInstalls": details.get("realInstalls"),
"dailyInstalls": details.get("dailyInstalls"),
"minDailyInstalls": details.get("minDailyInstalls"),
"realDailyInstalls": details.get("realDailyInstalls"),
"monthlyInstalls": details.get("monthlyInstalls"),
"minMonthlyInstalls": details.get("minMonthlyInstalls"),
"realMonthlyInstalls": details.get("realMonthlyInstalls"),
"score": details.get("score"),
"ratings": details.get("ratings"),
"reviews": details.get("reviews"),
"histogram": details.get("histogram"),
"adSupported": details.get("adSupported"),
"containsAds": details.get("containsAds"),
"version": details.get("version"),
"androidVersion": details.get("androidVersion"),
"maxAndroidApi": details.get("maxandroidapi"),
"minAndroidApi": details.get("minandroidapi"),
"appBundle": details.get("appBundle"),
"contentRating": details.get("contentRating"),
"contentRatingDescription": details.get("contentRatingDescription"),
"whatsNew": details.get("whatsNew"),
"permissions": details.get("permissions"),
"dataSafety": details.get("dataSafety"),
"price": details.get("price"),
"currency": details.get("currency"),
"free": details.get("free"),
"offersIAP": details.get("offersIAP"),
"inAppProductPrice": details.get("inAppProductPrice"),
"sale": details.get("sale"),
"originalPrice": details.get("originalPrice"),
"developer": details.get("developer"),
"developerId": details.get("developerId"),
"developerEmail": details.get("developerEmail"),
"developerWebsite": details.get("developerWebsite"),
"developerAddress": details.get("developerAddress"),
"developerPhone": details.get("developerPhone"),
"privacyPolicy": details.get("privacyPolicy"),
"appUrl": details.get("url"),
}
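# Usage sketch for AppParser (illustrative only; the dataset is assumed to be
# produced by the companion scraper, with "ds:5" holding the app's
# service-data blob as a JSON string):
#
#     parser = AppParser()
#     details = parser.parse_app_data(dataset, "com.example.app")
#     app = parser.format_app_data(details)
#     app["title"], app["score"], app["appUrl"]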
class SearchParser:
"""Parser for extracting and formatting search results."""
def parse_search_results(self, dataset: Dict, count: int) -> List[Dict]:
"""Parse search results from dataset.
Args:
dataset: Raw dataset from scraper
count: Maximum number of results to parse
Returns:
List of parsed search result dictionaries
"""
if "ds:1" not in dataset:
return []
search_data = nested_lookup(dataset.get("ds:1", {}), [0, 1, 0, 0, 0])
if not search_data:
return []
results = []
n_apps = min(len(search_data), count)
for i in range(n_apps):
app = self.extract_search_result(search_data[i])
if app:
results.append(app)
return results[:count]
def extract_search_result(self, data) -> Optional[Dict]:
"""Extract single search result from raw data.
Args:
data: Raw search result data
Returns:
Dictionary with extracted search result or None if extraction fails
"""
try:
result = {}
for key, spec in ElementSpecs.Search.items():
result[key] = spec.extract_content(data)
return result
except Exception:
return None
def format_search_result(self, result: dict) -> dict:
"""Format parsed search result into final structure.
Args:
result: Parsed search result
Returns:
Formatted dictionary with search result fields
"""
return {
"appId": result.get("appId"),
"title": result.get("title"),
"description": result.get("summary"),
"icon": result.get("icon"),
"developer": result.get("developer"),
"score": result.get("score"),
"scoreText": result.get("scoreText"),
"currency": result.get("currency"),
"price": result.get("price"),
"free": result.get("free"),
"url": result.get("url"),
}
def extract_pagination_token(self, dataset: Dict) -> Optional[str]:
"""Extract pagination token from search dataset.
Args:
dataset: Search dataset
Returns:
Pagination token or None
"""
sections = nested_lookup(dataset.get("ds:1", {}), [0, 1, 0, 0])
if not sections:
return None
for section in sections:
if isinstance(section, list) and len(section) > 1:
potential_token = nested_lookup(section, [1])
if isinstance(potential_token, str):
return potential_token
return None
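# Pagination sketch (the token shape is assumed; fetching the next page is
# the scraper's responsibility, this parser only extracts the token):
#
#     parser = SearchParser()
#     results = parser.parse_search_results(dataset, count=30)
#     token = parser.extract_pagination_token(dataset)
#     # pass `token` back to the scraper to request the next result page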
def parse_html_content(self, html_content: str) -> Dict:
"""Extract datasets from search page HTML.
Args:
html_content: HTML content of search page
Returns:
Dictionary containing all datasets
Raises:
DataParsingError: If no datasets found
"""
script_regex = re.compile(r"AF_initDataCallback[\s\S]*?</script")
key_regex = re.compile(r"(ds:.*?)'")
value_regex = re.compile(r"data:([\s\S]*?), sideChannel: \{\}\}\);</")
matches = script_regex.findall(html_content)
dataset = {}
for match in matches:
key_match = key_regex.findall(match)
value_match = value_regex.findall(match)
if key_match and value_match:
key = key_match[0]
try:
value = json.loads(value_match[0])
dataset[key] = value
except json.JSONDecodeError:
continue
if not dataset:
raise DataParsingError("No search data found in HTML")
return dataset
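# Self-contained illustration of the extraction above; the HTML snippet is
# synthetic and only mimics the AF_initDataCallback shape this parser expects:
#
#     html = ("<script>AF_initDataCallback({key: 'ds:1', "
#             "data:[[1, 2, 3]], sideChannel: {}});</script>")
#     SearchParser().parse_html_content(html)  # -> {"ds:1": [[1, 2, 3]]}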
class ReviewsParser:
"""Parser for extracting and formatting user reviews."""
def parse_reviews_response(self, content: str) -> Tuple[List[Dict], Optional[str]]:
"""Parse reviews from API response content.
Args:
content: Raw API response content
Returns:
Tuple of (list of review dictionaries, next page token)
"""
regex = re.compile(r"\)]}'\n\n([\s\S]+)")
matches = regex.findall(content)
if not matches:
return [], None
try:
data = json.loads(matches[0])
reviews_data = json.loads(data[0][2])
next_token = None
try:
next_token = reviews_data[-2][-1]
except (IndexError, TypeError):
pass
if not reviews_data or not reviews_data[0]:
return [], None
reviews = []
for review_raw in reviews_data[0]:
review = self.extract_review_data(review_raw)
if review:
reviews.append(review)
return reviews, next_token
except (json.JSONDecodeError, IndexError, KeyError):
return [], None
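# Wire-format note: these batchexecute-style responses begin with the
# anti-XSSI guard ")]}'", and data[0][2] in the outer JSON envelope is itself
# a JSON-encoded string holding the review rows, hence the double json.loads
# above. A minimal synthetic round trip (payload shape assumed):
#
#     inner = json.dumps([[["rev-1", ["Alice"], 5, None, "Great app"]], [None, "TOKEN"], None])
#     content = ")]}'\n\n" + json.dumps([[None, None, inner]])
#     reviews, token = ReviewsParser().parse_reviews_response(content)
#     # reviews[0]["userName"] == "Alice", token == "TOKEN"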
def extract_review_data(self, review_raw) -> Optional[Dict]:
"""Extract single review from raw data.
Args:
review_raw: Raw review data array
Returns:
Dictionary with extracted review data or None if extraction fails
"""
try:
review = {
"reviewId": review_raw[0] if len(review_raw) > 0 else None,
"userName": review_raw[1][0] if len(review_raw) > 1 and review_raw[1] else None,
"userImage": None,
"content": review_raw[4] if len(review_raw) > 4 else None,
"score": review_raw[2] if len(review_raw) > 2 else None,
"thumbsUpCount": review_raw[6] if len(review_raw) > 6 else None,
"at": datetime.fromtimestamp(review_raw[5][0]).isoformat() if len(review_raw) > 5 and review_raw[5] else None,
"appVersion": review_raw[10] if len(review_raw) > 10 else None,
}
try:
if len(review_raw) > 1 and review_raw[1] and len(review_raw[1]) > 1 and review_raw[1][1]:
review["userImage"] = review_raw[1][1][3][2]
except (IndexError, TypeError):
pass
return review
except Exception:
return None
def parse_multiple_responses(self, dataset: Dict) -> List[Dict]:
"""Parse multiple review responses.
Args:
dataset: Dataset containing multiple review responses
Returns:
List of all parsed reviews
"""
responses = dataset.get("reviews", [])
all_reviews = []
for response in responses:
reviews, _ = self.parse_reviews_response(response)
all_reviews.extend(reviews)
return all_reviews
def format_reviews_data(self, reviews_data: List[Dict]) -> List[Dict]:
"""Format parsed reviews into final structure.
Args:
reviews_data: List of parsed reviews
Returns:
List of formatted review dictionaries
"""
formatted_reviews = []
for review in reviews_data:
formatted_review = {
"reviewId": review.get("reviewId"),
"userName": review.get("userName"),
"userImage": review.get("userImage"),
"score": review.get("score"),
"content": review.get("content"),
"thumbsUpCount": review.get("thumbsUpCount"),
"appVersion": review.get("appVersion"),
"at": review.get("at"),
}
formatted_reviews.append(formatted_review)
return formatted_reviews
class DeveloperParser:
"""Parser for extracting and formatting developer apps."""
def parse_developer_data(self, dataset: Dict, dev_id: str) -> List[Dict]:
"""Parse developer apps from dataset.
Args:
dataset: Raw dataset from scraper
dev_id: Developer ID (numeric or string)
Returns:
List of parsed app dictionaries
Raises:
DataParsingError: If parsing fails
"""
ds3_data = dataset.get("ds:3", "")
if not ds3_data:
raise DataParsingError(Config.ERROR_MESSAGES["NO_DS3_DATA"])
json_str_cleaned = clean_json_string(ds3_data)
try:
data = json.loads(json_str_cleaned)
except json.JSONDecodeError as e:
try:
alternative_cleaned = alternative_json_clean(ds3_data)
data = json.loads(alternative_cleaned)
except Exception:
raise DataParsingError(Config.ERROR_MESSAGES["DS3_JSON_PARSE_FAILED"].format(error=str(e)))
# Navigate to apps array based on dev_id type
is_numeric = dev_id.isdigit()
if is_numeric:
apps_path = [0, 1, 0, 21, 0]
else:
apps_path = [0, 1, 0, 22, 0]
apps_data = nested_lookup(data.get("data", data), apps_path)
if not apps_data:
return []
apps = []
for app_data in apps_data:
app_details = {}
for key, spec in ElementSpecs.Developer.items():
app_details[key] = spec.extract_content(app_data)
if app_details.get("title"):
apps.append(app_details)
return apps
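# Path note: numeric developer IDs and vanity/name IDs render slightly
# different page layouts, which is why the apps array is looked up at
# [0, 1, 0, 21, 0] versus [0, 1, 0, 22, 0] above. Illustrative call (the ID
# is hypothetical):
#
#     apps = DeveloperParser().parse_developer_data(dataset, "1234567890123456789")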
def format_developer_data(self, apps_data: List[Dict]) -> List[Dict]:
"""Format parsed developer apps into final structure.
Args:
apps_data: List of parsed apps
Returns:
List of formatted app dictionaries
"""
formatted_apps = []
for app in apps_data:
formatted_app = {
"appId": app.get("appId"),
"title": app.get("title"),
"description": app.get("description"),
"icon": app.get("icon"),
"developer": app.get("developer"),
"score": app.get("score"),
"scoreText": app.get("scoreText"),
"currency": app.get("currency"),
"price": app.get("price"),
"free": app.get("free"),
"url": app.get("url"),
}
formatted_apps.append(formatted_app)
return formatted_apps
class SimilarParser:
"""Parser for extracting and formatting similar apps."""
def parse_similar_data(self, dataset: Dict) -> List[Dict]:
"""Parse similar apps from dataset.
Args:
dataset: Raw dataset from scraper
Returns:
List of parsed similar app dictionaries
"""
ds3_data = dataset.get("ds:3", "")
if not ds3_data:
return []
json_str_cleaned = clean_json_string(ds3_data)
try:
data = json.loads(json_str_cleaned)
except json.JSONDecodeError:
try:
alternative_cleaned = alternative_json_clean(ds3_data)
data = json.loads(alternative_cleaned)
except Exception:
return []
apps_data = nested_lookup(data.get("data", data), [0, 1, 0, 21, 0])
if not apps_data:
return []
apps = []
for app_data in apps_data:
app_details = {}
for key, spec in ElementSpecs.Similar.items():
app_details[key] = spec.extract_content(app_data)
if app_details.get("title"):
apps.append(app_details)
return apps
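# Similar-apps sketch (the dataset is assumed to come from an app details
# page fetch; on any parse failure this method deliberately returns an empty
# list instead of raising):
#
#     similar = SimilarParser().parse_similar_data(dataset)
#     SimilarParser().format_similar_data(similar)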
def format_similar_data(self, apps_data: List[Dict]) -> List[Dict]:
"""Format parsed similar apps into final structure.
Args:
apps_data: List of parsed apps
Returns:
List of formatted app dictionaries
"""
formatted_apps = []
for app in apps_data:
formatted_app = {
"appId": app.get("appId"),
"title": app.get("title"),
"description": app.get("description"),
"icon": app.get("icon"),
"developer": app.get("developer"),
"score": app.get("score"),
"scoreText": app.get("scoreText"),
"currency": app.get("currency"),
"price": app.get("price"),
"free": app.get("free"),
"url": app.get("url"),
}
formatted_apps.append(formatted_app)
return formatted_apps
class ListParser:
"""Parser for extracting and formatting top chart apps."""
def parse_list_data(self, dataset: Dict, count: int) -> List[Dict]:
"""Parse top chart apps from dataset.
Args:
dataset: Raw dataset from scraper
count: Maximum number of apps to parse
Returns:
List of parsed app dictionaries
"""
collection_data = dataset.get("collection_data")
if not collection_data:
return []
apps_data = nested_lookup(collection_data, [0, 1, 0, 28, 0])
if not apps_data:
return []
apps = []
for app_data in apps_data[:count]:
app_details = {}
for key, spec in ElementSpecs.List.items():
app_details[key] = spec.extract_content(app_data)
if app_details.get("title"):
apps.append(app_details)
return apps
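# Top-charts sketch ("collection_data" is assumed to be assembled by the
# scraper from the charts endpoint before it reaches this parser):
#
#     parser = ListParser()
#     top = parser.parse_list_data({"collection_data": collection_data}, count=50)
#     parser.format_list_data(top)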
def format_list_data(self, apps_data: List[Dict]) -> List[Dict]:
"""Format parsed list apps into final structure.
Args:
apps_data: List of parsed apps
Returns:
List of formatted app dictionaries
"""
formatted_apps = []
for app in apps_data:
formatted_app = {
"appId": app.get("appId"),
"title": app.get("title"),
"description": app.get("description"),
"icon": app.get("icon"),
"screenshots": app.get("screenshots"),
"developer": app.get("developer"),
"genre": app.get("genre"),
"score": app.get("score"),
"scoreText": app.get("scoreText"),
"installs": app.get("installs"),
"currency": app.get("currency"),
"price": app.get("price"),
"free": app.get("free"),
"url": app.get("url"),
}
formatted_apps.append(formatted_app)
return formatted_apps
class SuggestParser:
"""Parser for extracting and formatting search suggestions."""
def parse_suggestions(self, dataset: Dict) -> List[str]:
"""Parse suggestions from dataset.
Args:
dataset: Raw dataset from scraper
Returns:
List of suggestion strings
"""
return dataset.get("suggestions", [])
def format_suggestions(self, suggestions: List[str]) -> List[str]:
"""Format suggestions (pass-through for strings).
Args:
suggestions: List of suggestion strings
Returns:
Same list of suggestion strings
"""
return suggestions
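# Suggest sketch: the scraper already delivers suggestions as plain strings,
# so parsing is a simple lookup and formatting is a pass-through:
#
#     SuggestParser().parse_suggestions({"suggestions": ["podcast", "podcast player"]})
#     # -> ["podcast", "podcast player"]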