Skip to content

Commit

Permalink
Merge pull request #11 from YasogaN/dev
Browse files: browse the repository at this point in the history
+ Code optimization to reduce package size
+ Bump Package Version
  • Loading branch information
YasogaN authored Dec 2, 2024
2 parents 269a8ff + 66b7a6a commit b711312
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 108 deletions.
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "google-maps-review-scraper",
"version": "1.3.0",
"version": "1.3.1",
"description": "A review scraper made for google maps",
"main": "index.js",
"type": "module",
Expand Down
33 changes: 13 additions & 20 deletions src/listenentities.js
Original file line number Diff line number Diff line change
@@ -1,25 +1,18 @@
import { hexToDec } from "hex2dec";

/**
 * Converts a Google Maps place URL to a `listentitiesreviews` endpoint URL.
 *
 * The place id is read from the `!1s<id>!` segment of the URL. It must consist
 * of two colon-separated hexadecimal numbers (an optional `0x` prefix is
 * accepted); both are converted to decimal before being placed in the result.
 *
 * @param {string} url Google Maps place URL.
 * @param {string} [p=""] Base64 encoding of the page number.
 * @returns {string} URL to fetch reviews.
 * @throws {Error} "Invalid URL" if the place-id segment is missing or is not
 *   made of exactly two hexadecimal numbers.
 */
export default function parseReviewURL(url, p = "") {
  const m = url.match(/!1s([a-zA-Z0-9_:]+)!/);
  if (!m || !m[1]) {
    throw new Error("Invalid URL");
  }

  const parts = m[1].split(":");
  if (parts.length !== 2) {
    throw new Error("Invalid URL");
  }
  // Arrow wrapper so that map's extra (index, array) arguments are not forwarded.
  const [h1, h2] = parts.map((part) => hexToDecimalString(part));

  // Without a page token the pagination group has one field, with it two.
  const pS = p ? `!2m2!2i10!3s${p}` : "!2m1!2i10";
  return `https://www.google.com/maps/preview/review/listentitiesreviews?authuser=0&hl=en&gl=in&pb=!1m2!1y${h1}!2y${h2}${pS}!3e1!4m5!3b1!4b1!5b1!6b1!7b1!5m2!1sdzvaXrvAMImImAXHsLPICA!7e81`;
}

/**
 * Converts a hexadecimal string of arbitrary length to its decimal string.
 * BigInt is used because the id halves can exceed Number.MAX_SAFE_INTEGER.
 *
 * @param {string} hex Hexadecimal digits, optionally prefixed with `0x`.
 * @returns {string} The same value written in base 10.
 * @throws {Error} "Invalid URL" if `hex` contains non-hexadecimal characters.
 */
function hexToDecimalString(hex) {
  const digits = hex.replace(/^0x/i, "");
  if (!/^[0-9a-f]+$/i.test(digits)) {
    throw new Error("Invalid URL");
  }
  return BigInt(`0x${digits}`).toString();
}
31 changes: 14 additions & 17 deletions src/listugcposts.js
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
/**
 * Converts a Google Maps place URL to a `listugcposts` endpoint URL.
 *
 * The place id is copied verbatim from the `!1s<id>!` segment of `url`.
 *
 * @param {string} url Google Maps place URL.
 * @param {1|2|3|4} so Sorting order (1: Most Relevant, 2: Newest, 3: Highest Rating, 4: Lowest Rating).
 * @param {string} [pg=""] Base64 encoding of the page number; inserted as given.
 * @param {string} [sq=""] Search query for filtering reviews; percent-encoded here,
 *   so pass the raw, unencoded text.
 * @returns {string} URL to fetch reviews.
 * @throws {Error} "Invalid URL" if `url` has no `!1s<id>!` segment.
 */
export default function listugcposts(url, so, pg = "", sq = "") {
  const m = url.match(/!1s([a-zA-Z0-9_:]+)!/);
  if (!m || !m[1]) {
    throw new Error("Invalid URL");
  }

  // The query is free text: characters such as "&" or "#" would otherwise end
  // the `pb` parameter early and corrupt the request.
  // NOTE(review): "!" is left as-is by encodeURIComponent and is also the field
  // separator of `pb`; confirm how the endpoint expects it to be escaped.
  const query = encodeURIComponent(sq);

  return `https://www.google.com/maps/rpc/listugcposts?authuser=0&hl=en&gl=in&pb=!1m7!1s${m[1]}!3s${query}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${pg}!5m2!1sBnOwZvzePPfF4-EPy7LK0Ak!7e81!8m5!1b1!2b1!3b1!5b1!7b1!11m6!1e3!2e1!3sen!4slk!6m1!1i2!13m1!1e${so}`;
}
78 changes: 40 additions & 38 deletions src/parser.js
Original file line number Diff line number Diff line change
@@ -1,41 +1,43 @@
/**
 * Parses an array of raw reviews and returns the result as a JSON string.
 *
 * Each entry of `reviews` is an array whose first element is the nested review
 * record; all fields are read from fixed positions inside that record.
 *
 * @param {Array} reviews - The raw review entries to parse.
 * @returns {Promise<string>} Resolves to the parsed reviews serialized as JSON
 *   with a 2-space indent. `text`, `language`, `images` and image `caption`
 *   are `null` when absent from the record.
 * @throws {TypeError} (as a rejection) if a record lacks one of the mandatory
 *   nested arrays (time, author, rating or source).
 */
export default async function parseReviews(reviews) {
  // The mapping is synchronous; the function stays async only so that callers
  // keep receiving a promise.
  const parsedReviews = reviews.map(([review]) => ({
    review_id: review[0],
    time: {
      published: review[1][2],
      last_edited: review[1][3],
    },
    author: {
      name: review[1][4][5][0],
      profile_url: review[1][4][5][1],
      url: review[1][4][5][2][0],
      id: review[1][4][5][3],
    },
    review: {
      rating: review[2][0][0],
      text: review[2][15]?.[0]?.[0] || null,
      language: review[2][14]?.[0] || null,
    },
    images: review[2][2]?.map((image) => ({
      id: image[0],
      url: image[1][6][0],
      size: {
        width: image[1][6][2][0],
        height: image[1][6][2][1],
      },
      location: {
        friendly: image[1][21][3][7]?.[0],
        lat: image[1][8][0][2],
        long: image[1][8][0][1],
      },
      caption: image[1][21][3][5]?.[0] || null,
    })) || null,
    source: review[1][13][0],
  }));

  return JSON.stringify(parsedReviews, null, 2);
}
57 changes: 27 additions & 30 deletions src/utils.js
Original file line number Diff line number Diff line change
@@ -1,80 +1,77 @@
import listugcposts from "./listugcposts.js";
import axios from "axios";
import { SortEnum } from "./types.js";
import { URL } from 'url';
import { URL } from "url";
import parser from "./parser.js";

/**
 * Validates parameters for the Google Maps review scraper.
 *
 * @param {string} url - Must be a URL on host "www.google.com" whose path starts with "/maps/place/".
 * @param {string} sort_type - Must be an own key of SortEnum with a truthy value.
 * @param {string|number} pages - "max", a number, or a numeric string.
 * @param {boolean} clean - Must be a boolean.
 * @throws {Error} If any parameter is invalid. For an unparsable URL the
 *   underlying parser error is attached as `cause`.
 */
export function validateParams(url, sort_type, pages, clean) {
  let parsedUrl;
  try {
    parsedUrl = new URL(url);
  } catch (cause) {
    // Report malformed input with the same message as a wrong host or path.
    throw new Error(`Invalid URL: ${url}`, { cause });
  }
  if (parsedUrl.host !== "www.google.com" || !parsedUrl.pathname.startsWith("/maps/place/")) {
    throw new Error(`Invalid URL: ${url}`);
  }

  // Own-property check so inherited names such as "toString" are rejected.
  const isKnownSort = Object.prototype.hasOwnProperty.call(SortEnum, sort_type);
  if (!isKnownSort || !SortEnum[sort_type]) {
    throw new Error(`Invalid sort type: ${sort_type}`);
  }

  // The global isNaN() coerces "", null, true and [] to numbers and would let
  // them through; only real numbers and non-blank numeric strings are accepted.
  const isNumberLike =
    typeof pages === "number" || (typeof pages === "string" && pages.trim() !== "");
  if (pages !== "max" && !(isNumberLike && !Number.isNaN(Number(pages)))) {
    throw new Error(`Invalid pages value: ${pages}`);
  }

  if (typeof clean !== "boolean") {
    throw new Error(`Invalid value for 'clean': ${clean}`);
  }
}

/**
 * Fetches reviews from a given URL with sorting and pagination options.
 *
 * The response body is expected to be text in which the JSON payload follows
 * a `)]}'` marker; everything after the first marker is parsed as JSON.
 *
 * @param {string} url - The URL to fetch reviews from.
 * @param {string} sort - The sorting option for the reviews.
 * @param {string} [nextPage=""] - Token for the next page, if any.
 * @param {string} [search_query=""] - Search query to filter reviews, if any.
 * @returns {Promise<Object>} Parsed JSON data of reviews.
 * @throws {Error} If the request fails, the status is not 200, the marker is
 *   missing, or the payload is not valid JSON.
 */
export async function fetchReviews(url, sort, nextPage = "", search_query = "") {
  const apiUrl = await listugcposts(url, sort, nextPage, search_query);
  const response = await axios.get(apiUrl);
  if (response.status !== 200) {
    throw new Error(`Failed to fetch reviews: ${response.status}`);
  }

  const marker = ")]}'";
  const body = response.data;
  const markerAt = typeof body === "string" ? body.indexOf(marker) : -1;
  if (markerAt === -1) {
    throw new Error(`Failed to fetch reviews: response is missing the ${marker} marker`);
  }
  // Slice after the FIRST marker only: splitting on it would cut the payload
  // short whenever the same characters occur inside a review.
  return JSON.parse(body.slice(markerAt + marker.length));
}

/**
 * Paginates through reviews from a given URL.
 *
 * `initialData` is the already-fetched first page; fetching continues from
 * page 2 for as long as a next-page token is returned and the page limit
 * allows. The caller's `initialData` is never modified.
 *
 * @param {string} url - The URL to fetch reviews from.
 * @param {string} sort - Sorting parameter for reviews.
 * @param {string|number} pages - Number of pages or "max".
 * @param {string} search_query - Search query to filter reviews.
 * @param {boolean} clean - Whether to clean and parse the data.
 * @param {Array} initialData - Initial data: reviews at index 2, next page token at index 1.
 * @returns {Promise<Array|string>} The collected raw reviews, or the parser's
 *   output for them when `clean` is true.
 */
export async function paginateReviews(url, sort, pages, search_query, clean, initialData) {
  // Work on a copy so results can be appended in place without touching the
  // caller's array; a missing review list counts as an empty page.
  const reviews = [...(initialData[2] ?? [])];
  const lastPage = pages === "max" ? Infinity : Number(pages);
  let nextPage = initialData[1]?.replace(/"/g, "");
  let currentPage = 2;

  while (nextPage && currentPage <= lastPage) {
    console.log(`Scraping page ${currentPage}...`);
    const data = await fetchReviews(url, sort, nextPage, search_query);
    reviews.push(...(data[2] ?? []));
    nextPage = data[1]?.replace(/"/g, "");
    currentPage++;

    // Pause only when another request will actually follow.
    if (nextPage && currentPage <= lastPage) {
      await new Promise((resolve) => setTimeout(resolve, 1000)); // Avoid rate-limiting
    }
  }

  return clean ? await parser(reviews) : reviews;
}

0 comments on commit b711312

Please sign in to comment.