/**
 * Urban Dictionary scraping helpers (src/scraper.ts).
 *
 * Restored from the collapsed patch for commit
 * 6a896efaefe1a6609f3d4753ca3ce8c960e531c5
 * ("Update scraper.ts — Write improved function for web scraping the UD
 * website", Amelia, Sat, 4 Jun 2022 17:07:51 +0100).
 */
import axios from "axios";
import * as cheerio from "cheerio";

/** Outcome tuple produced by {@link handlePromise}; the first element is the discriminant. */
type Settled<T> = [true, T] | [false, unknown];

// TODO: Move promise handling function to util file
/**
 * Awaits a Promise and reports the outcome as a tuple instead of throwing.
 * @param promise The Promise to be handled
 * @returns `[true, value]` when the Promise fulfils, `[false, error]` when it rejects
 */
async function handlePromise<T>(promise: Promise<T>): Promise<Settled<T>> {
  try {
    return [true, await promise];
  } catch (err) {
    return [false, err];
  }
}

// TODO:
// Rewrite in organized manner
// Retrieve ratings
// Retrieve date
// Retrieve example(s)

/** Vote counts attached to a definition. */
export type Rating = {
  upvotes: number;
  downvotes: number;
};

/** Contributor of a definition. */
export type Author = {
  name: string;
  /** `href` of the contributor link exactly as found in the page; `""` when absent. */
  url: string;
};

/** A single scraped definition. */
export type Definition = {
  meaning: string;
  example: string;
  rating: Rating;
  author: Author;
};

const DEFINE_URL = "https://www.urbandictionary.com/define.php";

/**
 * Gets the definitions listed on the first results page for a term.
 *
 * A failed request is logged with `console.error` and yields an empty array
 * rather than a rejection.
 *
 * @param term The term to be defined; it is URL-encoded before being sent
 * @returns every definition found on the page, in document order
 */
export async function getDefinitions(term: string): Promise<Definition[]> {
  // TODO: Accept params that limit/filter the definition
  // i.e: Web scrape from page N
  //      Only web scrape N results

  const outcome = await handlePromise(
    // Encode the term so spaces, `&`, `#`, ... cannot corrupt the query string.
    axios.get<string>(`${DEFINE_URL}?term=${encodeURIComponent(term)}`)
  );

  if (!outcome[0]) {
    console.error(outcome[1]);
    return [];
  }

  const $ = cheerio.load(outcome[1].data);
  const definitions: Definition[] = [];

  $(".p-5").each((_index, element) => {
    const panel = $(element);

    // `.p-5` also matches panels that are not definitions; skip any panel
    // that has no meaning element instead of emitting an empty entry.
    const meaningNode = panel.find(".meaning.mb-4").first();
    if (meaningNode.length === 0) {
      return;
    }

    const exampleNode = panel.find(".example.italic.mb-4").first();
    const authorLink = panel.find(".contributor").find("a").first();

    definitions.push({
      meaning: meaningNode.text().trim(),
      example: exampleNode.text().trim(),
      // TODO: Finish ratings
      // const upvotes = panel.find(".thumbs[data-direction=up]").find("span");
      rating: {
        upvotes: 0,
        downvotes: 0,
      },
      author: {
        name: authorLink.text().trim(),
        url: authorLink.attr("href") ?? "",
      },
    });
  });

  return definitions;
}

/**
 * Fetches a page and returns the text of its first (top) meaning.
 *
 * @param url Address of the page to scrape
 * @returns the trimmed text of the first `.meaning.mb-4` element (`""` when none exists)
 * @throws the original request error, after logging it, when the page cannot be fetched
 */
export default async function scrapeData(url: string): Promise<string> {
  const outcome = await handlePromise(axios.get<string>(url));

  if (!outcome[0]) {
    console.error(outcome[1]);

    // Re-throw the untouched error so callers see the same rejection value as before.
    throw outcome[1];
  }

  const $ = cheerio.load(outcome[1].data);

  // For now, only scraping the first/top meaning.
  return $(".meaning.mb-4").first().text().trim();
}