Add ytdl and fastdl methods to scrape content
akashvaghela09 committed Jun 28, 2024
1 parent 0f043d0 commit 088edc5
Showing 1 changed file with 169 additions and 4 deletions.
173 changes: 169 additions & 4 deletions src/apis.js
@@ -1,6 +1,13 @@
const axios = require("axios");
const { findMediaByShortCode, log, cleanTimelineResponse } = require("./utils");
const { INSTAGRAM_API_URL } = require("./constants");
const {
findMediaByShortCode,
cleanTimelineResponse,
waitFor,
log,
} = require("./utils");
const { INSTAGRAM_API_URL, MEDIA_TYPE } = require("./constants");
const { exec } = require("child_process");
const { Browser } = require("./config");

const fetchOwnerId = async (shortCode) => {
try {
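// Note: `waitFor` and `MEDIA_TYPE` are new imports from ./utils and ./constants,
// neither of which appears in this diff. Judging by how they are used below, a
// minimal sketch of those helpers might look like the following (assumed values,
// illustrative only):

// Promise-based delay, used to throttle paginated timeline requests.
const waitFor = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

// Labels attached to scraped media items (exact values are an assumption).
const MEDIA_TYPE = {
    VIDEO: "video",
    IMAGE: "image",
    MEDIA_GROUP: "media_group",
};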
@@ -13,7 +20,7 @@ const fetchOwnerId = async (shortCode) => {
return { success: true, data: ownerId };
}
} catch (error) {
console.error("Error fetching owner ID:", error);
log("Error fetching owner ID:", error);
}

return { success: false };
@@ -27,7 +34,7 @@ const fetchTimelineData = async (ownerId, after = null) => {
const response = await axios.get(url);
return { success: true, data: response.data };
} catch (error) {
console.error("Error fetching timeline data:", error);
log("Error fetching timeline data:", error?.response?.data);
}
return {
success: false,
@@ -56,6 +63,7 @@ const getStreamDataRecursively = async (shortCode, ownerId, after = null) => {
}

if (pageInfo?.has_next_page) {
await waitFor(500);
return getStreamDataRecursively(
shortCode,
ownerId,
@@ -66,8 +74,165 @@ const getStreamDataRecursively = async (shortCode, ownerId, after = null) => {
return { success: false };
};

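// Resolve a direct video URL for an Instagram post by shelling out to the local
// yt-dlp binary (./yt-dlp_linux) with the bundled cookies.txt. With "-f b -g",
// yt-dlp prints the best-format stream URL to stdout, which is returned as the
// media URL.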
const getMediaUrl = async (instagramUrl) => {
// Command to execute yt-dlp to fetch video URL
const command = `./yt-dlp_linux -f b -g --cookies ./cookies.txt "${instagramUrl}"`;

try {
return new Promise((resolve, reject) => {
exec(command, (error, stdout, stderr) => {
if (error) {
console.error(`exec error: ${error}`);
return reject({ success: false, data: { mediaUrl: null } });
}

// If successful, stdout contains the direct video URL
const mediaUrl = stdout.trim(); // Trim whitespace, if any
resolve({
success: true,
data: {
mediaUrl,
mediaType: MEDIA_TYPE.VIDEO,
},
});
});
});
} catch (error) {
log("failed in exec command: ", error);
return { success: false, data: { mediaUrl: null } };
}
};

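// Scrape media details for an Instagram URL by driving the shared headless
// browser instance (Browser.browserInstance) against https://fastdl.app/en:
// the URL is typed into the site's search form, the form is submitted, and the
// caption plus the list of media items (download URL, preview image, and
// video/image type) are read from the rendered results.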
const scrapWithFastDl = async (requestUrl) => {
const browser = Browser.browserInstance;
let page;
const finalResponse = {
data: {},
success: false,
};

try {
page = await browser.newPage();

await page.goto("https://fastdl.app/en");
console.log("browser went to fastdl");

// Wait for the input field to be ready and type the URL
await page.waitForSelector("#search-form-input");
await page.type("#search-form-input", requestUrl, { delay: 10 });
console.log("Typed URL into input field");

// Click the button with class search-form__button, type submit
await page.evaluate(() => {
const downloadButton = document.querySelector(
'.search-form__button[type="submit"]'
);
if (downloadButton) {
downloadButton.click();
}
});

try {
const captionElement = await page.waitForSelector(
".output-list__caption",
{ timeout: 5000 }
);

if (captionElement) {
const captionText = await page.evaluate(
(element) => element.textContent,
captionElement
);
finalResponse.data.caption = captionText.trim();
}
} catch (error) {
console.log("failed to scrape caption: ", error);
}

try {
// Wait for the <ul> element to be present
const ulElement = await page.waitForSelector(".output-list__list", {
timeout: 5000,
});

if (ulElement) {
// Evaluate in the context of the page to extract information from each <li> item
const mediaList = await page.evaluate((ul) => {
const itemList = [];
// Select all <li> elements under the <ul>
const liElements =
ul.querySelectorAll(".output-list__item");

// Loop through each <li> element
liElements.forEach((li) => {
// Extract mediaUrl from <a> tag
const aTag = li.querySelector("a");
const mediaUrl = aTag ? aTag.href : "";

// Extract displayUrl from <img> tag
const imgTag = li.querySelector("img");
const displayUrl = imgTag ? imgTag.src : "";

// Extract mediaType from <span> tag
const spanTag = li.querySelector("span");
const classString = spanTag ? spanTag.className : "";

// Push the extracted data into itemList
itemList.push({ mediaUrl, displayUrl, classString });
});

return itemList;
}, ulElement);

// for (let i = 0; i < mediaList.length; i++) {
// console.log(mediaList[i]);
// }

let firstItem = {};
if (mediaList.length > 1) {
finalResponse.data.mediaType = MEDIA_TYPE.MEDIA_GROUP;
firstItem = mediaList[0];

for (let i = 0; i < mediaList.length; i++) {
if (mediaList[i].classString.includes("video")) {
mediaList[i].mediaType = MEDIA_TYPE.VIDEO;
} else {
mediaList[i].mediaType = MEDIA_TYPE.IMAGE;
}
}
} else if (mediaList.length === 1) {
firstItem = mediaList.shift();

if (firstItem.classString.includes("video")) {
finalResponse.data.mediaType = MEDIA_TYPE.VIDEO;
} else {
finalResponse.data.mediaType = MEDIA_TYPE.IMAGE;
}
}
finalResponse.data.mediaUrl = firstItem.mediaUrl;
finalResponse.data.displayUrl = firstItem.displayUrl;
finalResponse.data.mediaList = mediaList;
finalResponse.success = true;
} else {
console.error("UL element not found");
}
} catch (error) {
console.error("Error scraping items:", error);
}
} catch (error) {
console.error("Error in scraping:", error);
} finally {
    // Guard against the case where newPage() failed and `page` was never assigned
    if (page) {
        await page.close();
        console.log("Page closed after scraping");
    }
}

return finalResponse;
};

module.exports = {
fetchOwnerId,
fetchTimelineData,
getStreamDataRecursively,
getMediaUrl,
scrapWithFastDl,
};
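A minimal usage sketch for the two new scrapers, assuming they are required from src/apis.js inside an async context and that Browser.browserInstance has already been launched by the existing config; the post URL and the calling module are placeholders:

const { getMediaUrl, scrapWithFastDl } = require("./apis");

const demo = async () => {
    const postUrl = "https://www.instagram.com/p/SHORTCODE/"; // placeholder

    // yt-dlp route: resolves to { success, data: { mediaUrl, mediaType } },
    // or rejects if the exec call fails.
    const ytdlResult = await getMediaUrl(postUrl).catch(() => ({ success: false }));
    console.log("ytdl:", ytdlResult);

    // fastdl route: requires Browser.browserInstance to be initialized first.
    const fastdlResult = await scrapWithFastDl(postUrl);
    console.log("fastdl:", fastdlResult.success, fastdlResult.data.mediaType);
};

demo();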
