Update scraper.ts — Write improved function for web scraping the UD website
This commit is contained in:
parent
8c82661bac
commit
6a896efaef
|
@ -0,0 +1,127 @@
|
||||||
|
import axios from "axios";
|
||||||
|
import * as cheerio from "cheerio";
|
||||||
|
|
||||||
|
// TODO: Move promise handling function to util file
|
||||||
|
/**
 * Util function that awaits a Promise and reports its outcome as a tuple
 * instead of throwing.
 *
 * The first tuple element is a literal flag, so checking it narrows the type
 * of the second element at the call site.
 *
 * @param promise The Promise to be handled
 * @returns `[true, value]` when the Promise fulfils, or `[false, reason]` when
 * it rejects. The returned Promise itself never rejects.
 */
async function handlePromise<T>(
  promise: Promise<T>
): Promise<[true, T] | [false, unknown]> {
  try {
    const data = await promise;

    return [true, data];
  } catch (err) {
    // Hand the rejection reason back untouched; the caller decides what to do.
    return [false, err];
  }
}
|
||||||
|
|
||||||
|
// TODO:
|
||||||
|
// Rewrite in organized manner
|
||||||
|
// Retrieve author
|
||||||
|
// Retrieve ratings
|
||||||
|
// Retrieve date
|
||||||
|
// Retrieve example(s)
|
||||||
|
|
||||||
|
/** Vote counts attached to a definition. */
type Rating = {
  upvotes: number;
  downvotes: number;
};

/** The contributor credited with a definition. */
type Author = {
  name: string;
  /** `href` of the contributor link as found in the page; empty when the link is missing. */
  url: string;
};

/** A single scraped definition entry. */
type Definition = {
  meaning: string;
  example: string;
  rating: Rating;
  author: Author;
};

/**
 * Extracts every definition entry from the HTML of a definition page.
 *
 * @param html Markup of the page to parse
 * @returns One item per container that holds a meaning, in document order
 */
function parseDefinitions(html: string): Definition[] {
  const $ = cheerio.load(html);
  const definitions: Definition[] = [];

  // `.p-5` also matches containers that hold no definition, so containers
  // without a meaning element are skipped instead of yielding empty entries.
  $(".p-5").each((_index, element) => {
    const entry = $(element);

    const meaningNode = entry.find(".meaning.mb-4").first();
    if (meaningNode.length === 0) {
      return; // not a definition container; move on to the next one
    }

    const exampleNode = entry.find(".example.italic.mb-4").first();
    const authorLink = entry.find(".contributor").find("a").first();

    // TODO: Finish ratings
    // const upvotes = entry.find(".thumbs[data-direction=up]").find("span");

    definitions.push({
      meaning: meaningNode.text().trim(),
      example: exampleNode.text().trim(),
      // Placeholder until the ratings TODO above is implemented.
      rating: {
        upvotes: 0,
        downvotes: 0,
      },
      author: {
        name: authorLink.text().trim(),
        // `attr` yields undefined when no contributor link exists.
        url: authorLink.attr("href") ?? "",
      },
    });
  });

  return definitions;
}

/**
 * Gets the definitions for a term
 *
 * @param term The term to be defined
 * @returns The definitions scraped from the term's page; an empty array when
 * the request fails (the failure reason is logged)
 */
async function getDefinitions(term: string): Promise<Definition[]> {
  // TODO: Accept params that limit/filter the definition
  // i.e: Web scrape from page N
  //      Only web scrape N results

  // Encode the term so spaces, `&`, `#` and similar characters cannot corrupt
  // the query string.
  const url = `https://www.urbandictionary.com/define.php?term=${encodeURIComponent(
    term
  )}`;

  const [resolved, response] = await handlePromise(axios.get(url));

  if (!resolved) {
    // A failed request still produces an empty list, but the reason is no
    // longer dropped silently.
    console.error(response);
    return [];
  }

  return parseDefinitions(response.data);
}
|
||||||
|
|
||||||
|
// NOTE(review): looks like a leftover debug call — it runs whenever this module
// is loaded, starts a request for "yogurt", and discards the returned Promise.
// Confirm whether it can be removed.
getDefinitions("yogurt");
|
||||||
|
|
||||||
|
/**
 * Scrapes the first/top meaning found on a page.
 *
 * @param url Address of the page to fetch
 * @returns The trimmed text of the first `.meaning.mb-4` element on the page
 * (an empty string when nothing matches)
 * @throws Rejects with the request's failure reason, after logging it, when
 * the page cannot be fetched
 */
export default async function scrapeData(url: string): Promise<string> {
  const [success, data] = await handlePromise(axios.get(url));

  if (!success) {
    console.error(data);

    // Re-throw the original reason unchanged so existing `catch` handlers
    // receive the same value as before.
    throw data;
  }

  const $ = cheerio.load(data.data);

  // For now, only scraping the first/top meaning
  return $(".meaning.mb-4").first().text().trim();
}
|
Loading…
Reference in New Issue