3 Simple Steps to Land Software Engineering Interviews During the Pandemic (ft. New Grads)
Back in the day, websites were primarily rendered server-side. This made it easy for bots to crawl and extract content from the page without using JavaScript. However, in the last couple of years, we've seen an increase in new JavaScript frameworks. With the rise of client-side rendered apps, it has become a bit more difficult to crawl websites, since some sites require JavaScript to render. Some client-side rendered pages fetch more content via infinite scrolling, making it impossible for a traditional bot to crawl. Thankfully, we can create a bot that can render JavaScript to crawl these types of web apps.
In this tutorial, we'll be using puppeteer to crawl Instagram. You can use this library to control Chromium to crawl websites.
Start by creating a new project with npm init
. Add puppeteer and request to your package.json
via:
npm add puppeteer request
Your package.json
will look something like this afterwards.
{
"name": "infinite-crawler",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"puppeteer": "^5.2.1",
"request": "^2.88.2"
}
}
Now let's create an index.js
file and populate it with the following code. We'll be setting headless to false to visualize what's going on.
const fs = require('fs');
const puppeteer = require('puppeteer');
const instagramUrl = "https://www.instagram.com/";
(async function main() {
  // Set up browser and page.
  const browser = await puppeteer.launch({
    headless: false, // visible browser so we can watch the crawl
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  const page = await browser.newPage();
  // setViewport returns a Promise — await it so the page is resized
  // before we navigate.
  await page.setViewport({ width: 1600, height: 1600 });
  const url = "https://www.instagram.com/thejenkinscomic";
  await page.goto(url);
  // Close the browser.
  await browser.close();
})();
Let's test this out by running node index
. If done correctly, you'll see Chrome open up the Instagram profile for thejenkinscomic.
Next, let's add some code that will scroll through the page to fetch more Instagram posts. The scrollToEndOfPage
function will continue scrolling until it hits the end of the page.
Instagram does something interesting to optimize for performance. Instagram optimizes the number of divs on the page by reusing the previously created divs instead of creating new divs on the page. This is the same concept used by Android Apps with a RecyclerView. Because the page is constantly changing, we'll need to scrape the page each time we scroll to make sure we have all the items. This also prevents crawlers that don't use Javascript from scraping the profile completely.
// Scrolls `page` to the bottom repeatedly until the page stops growing,
// running `extractImgLinks` inside the page after every scroll and
// accumulating whatever it returns.
//
// `extractImgLinks` must return an array. The default is `() => []`:
// the previous default of `() => {}` evaluated to undefined, which threw
// on spread and silently aborted the crawl after the first scroll.
async function scrollToEndOfPage(
  page,
  extractImgLinks = () => [],
) {
  let items = [];
  try {
    let previousHeight;
    while (true) {
      const curHeight = await page.evaluate('document.body.scrollHeight');
      if (previousHeight === curHeight) {
        break;
      }
      previousHeight = curHeight;
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      await page.waitFor(3500);
      // Re-scrape after every scroll: Instagram recycles DOM nodes, so
      // earlier posts disappear from the DOM as new ones load.
      const links = await page.evaluate(extractImgLinks).catch(err => {
        console.log(err);
        return [];
      });
      items = [...items, ...links];
    }
  } catch (e) {
    // Expected exit path: once the page stops growing, waitForFunction
    // eventually times out and throws — we just return what we have.
  }
  return items;
}
Update the main
function to call the scroller and then run node index
. This time the bot will open Instagram and scroll until there are no more posts left to fetch.
(async function main() {
...
await scrollToEndOfPage(page);
// Close the browser.
await browser.close();
});
Now we'll need to create a function that will scrape all the img tags. If you inspect the DOM of Instagram, you'll notice the posts have a src
and srcset
. For the sake of simplicity, we'll extract the src
, but if you're feeling ambitious you can scrape the srcset and grab the highest resolution image. You can learn more about srcset from this link.
// Collects the `src` of every image that sits inside an anchor tag.
function extractImgLinks() {
  const images = document.querySelectorAll('a img');
  return Array.from(images, (img) => img.src);
}
Update the scrollToEndOfPage
inside main
to scrape all the images using the extractImgLinks
function we wrote. Pass the function we just wrote to scrollToEndOfPage
.
(async function main() {
...
const links = await scrollToEndOfPage(page, extractImgLinks);
console.log(links);
...
await browser.close();
});
Finally, we'll need a function to download and save all the images that we scraped.
// Downloads the image at `url` into the `path` directory.
// The file name is taken from the last path segment of the URL, with any
// query string stripped (e.g. "/img.jpg?sig=abc" -> "/img.jpg").
// Always resolves (never rejects), so one failed download does not abort
// a Promise.all over many downloads.
function download(url, path) {
  const fileName = url.substring(url.lastIndexOf("/"));
  const outFile = fileName.indexOf("?") >= 0
    ? path + fileName.substring(0, fileName.indexOf("?"))
    : path + fileName;
  return new Promise((resolve) => {
    request.head(url, (err) => {
      if (err) {
        // HEAD failed (bad URL, network error) — skip this image instead
        // of issuing a GET that is bound to fail too.
        console.log(err);
        resolve();
        return;
      }
      request(url)
        .on('error', function (err) {
          console.log(err);
          resolve();
        })
        .pipe(fs.createWriteStream(outFile))
        .on('close', () => {
          resolve();
        });
    });
  });
}
We'll also need to add code to create an output directory. Because Instagram reuses the DOM to render the page, we might end up with duplicate links. The removeDuplicates
function will filter any duplicate links.
const fs = require('fs');
(async function main() {
...
// Creates a directory to store the images
const username = url.substring(instagramUrl.length);
const dir = "./" + username;
if (!fs.existsSync(dir)) {
fs.mkdirSync(dir);
}
// Filter duplicate links and downloads all the images
function removeDuplicates(array) {
return array.filter((a, b) => array.indexOf(a) === b)
};
const linksWithoutDuplicates = removeDuplicates(links);
await Promise.all(links.map(item => download(item, dir)));
});
Here's the final code. This is obviously very basic and can be updated to grab a higher resolution image via the srcset
. Other improvements include passing the profile URL as a command-line argument instead of hardcoding it.
const fs = require('fs');
const puppeteer = require('puppeteer');
const request = require('request');
const instagramUrl = "https://www.instagram.com/";
// Downloads the image at `url` into the `path` directory.
// The file name is taken from the last path segment of the URL, with any
// query string stripped (e.g. "/img.jpg?sig=abc" -> "/img.jpg").
// Always resolves (never rejects), so one failed download does not abort
// a Promise.all over many downloads.
function download(url, path) {
  const fileName = url.substring(url.lastIndexOf("/"));
  const outFile = fileName.indexOf("?") >= 0
    ? path + fileName.substring(0, fileName.indexOf("?"))
    : path + fileName;
  return new Promise((resolve) => {
    request.head(url, (err) => {
      if (err) {
        // HEAD failed (bad URL, network error) — skip this image instead
        // of issuing a GET that is bound to fail too.
        console.log(err);
        resolve();
        return;
      }
      request(url)
        .on('error', function (err) {
          console.log(err);
          resolve();
        })
        .pipe(fs.createWriteStream(outFile))
        .on('close', () => {
          resolve();
        });
    });
  });
}
// Collects the `src` of every image that sits inside an anchor tag.
function extractImgLinks() {
  const images = document.querySelectorAll('a img');
  return Array.from(images, (img) => img.src);
}
// Scrolls `page` to the bottom repeatedly until the page stops growing,
// running `extractImgLinks` inside the page after every scroll and
// accumulating whatever it returns.
//
// `extractImgLinks` must return an array. The default is `() => []`:
// the previous default of `() => {}` evaluated to undefined, which threw
// on spread and silently aborted the crawl after the first scroll.
async function scrollToEndOfPage(
  page,
  extractImgLinks = () => [],
) {
  let items = [];
  try {
    let previousHeight;
    while (true) {
      const curHeight = await page.evaluate('document.body.scrollHeight');
      if (previousHeight === curHeight) {
        break;
      }
      previousHeight = curHeight;
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      await page.waitFor(3500);
      // Re-scrape after every scroll: Instagram recycles DOM nodes, so
      // earlier posts disappear from the DOM as new ones load.
      const links = await page.evaluate(extractImgLinks).catch(err => {
        console.log(err);
        return [];
      });
      items = [...items, ...links];
    }
  } catch (e) {
    // Expected exit path: once the page stops growing, waitForFunction
    // eventually times out and throws — we just return what we have.
  }
  return items;
}
(async function main() {
  // Set up browser and page.
  const browser = await puppeteer.launch({
    headless: false, // visible browser so we can watch the crawl
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  const page = await browser.newPage();
  // setViewport returns a Promise — await it so the page is resized
  // before we navigate.
  await page.setViewport({ width: 1600, height: 1600 });
  const url = "https://www.instagram.com/thejenkinscomic";
  await page.goto(url);
  // Scroll and extract items from the page.
  const links = await scrollToEndOfPage(page, extractImgLinks);
  // Close the browser.
  await browser.close();
  // Creates a directory to store the images
  const username = url.substring(instagramUrl.length);
  const dir = "./" + username;
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir);
  }
  // Instagram recycles DOM nodes while scrolling, so the same image can be
  // scraped more than once — drop duplicates before downloading.
  function removeDuplicates(array) {
    return array.filter((item, index) => array.indexOf(item) === index);
  }
  const linksWithoutDuplicates = removeDuplicates(links);
  await Promise.all(linksWithoutDuplicates.map(item => download(item, dir)));
})();
Now you know how to build your very own front end rendered crawler. You can adapt this to your own use-cases. What do you plan on building? Let me know in the comments section below and check out Programming in School vs Working as a Software Engineer.