
Added cookies and localstorage scraping
nuhmanpk committed Nov 12, 2023
1 parent 78f93db commit 84f06d6
Showing 9 changed files with 111 additions and 7 deletions.
2 changes: 0 additions & 2 deletions README.md
@@ -1,8 +1,6 @@
# WebScrapperRoBot
Simple web scraper bot to scrape webpages using Requests, html5lib and BeautifulSoup.

![Screenshot](https://github.com/nuhmanpk/WebScrapper/blob/main/assets/new.png)
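
A minimal sketch of the fetch-and-parse pattern behind that stack (the URL and the link-extraction example are illustrative, not code from the bot):

```python
import requests
from bs4 import BeautifulSoup

def scrape_links(url):
    # Fetch the page, then parse it with the html5lib parser the bot relies on.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html5lib')
    # Collect every hyperlink found in the document.
    return [a['href'] for a in soup.find_all('a', href=True)]

print(scrape_links('https://example.com'))
```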

# Setting Up a Project and Configuring Environment Variables

To set up the project and configure environment variables, follow these steps:
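
The numbered steps are collapsed in this view. As a hedged sketch, `main.py` calls `load_dotenv()`, so configuration is read from environment variables roughly like this (the variable names below are common pyrogram credentials and are assumptions, not confirmed by this diff):

```python
# Hedged sketch only: variable names are assumed, not taken from the repo.
import os
from dotenv import load_dotenv

load_dotenv()                       # reads key=value pairs from a local .env file
API_ID = os.getenv("API_ID")        # assumed Telegram API credentials
API_HASH = os.getenv("API_HASH")
BOT_TOKEN = os.getenv("BOT_TOKEN")
```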
13 changes: 13 additions & 0 deletions demos/README.md
@@ -0,0 +1,13 @@
# WebScrapper Bot Demos

This folder contains screenshots showcasing the features of the WebScrapper bot.

## Screenshots

### Menu

![Menu](menu.png)

### Video Scraping

![Video Scraping](video-scraping.png)
File renamed without changes
Binary file added demos/video-scraping.png
8 changes: 8 additions & 0 deletions helpers.py
@@ -3,6 +3,7 @@
import requests
from utils import FINISHED_PROGRESS_STR, UN_FINISHED_PROGRESS_STR
import os
from selenium import webdriver

async def progress_bar(current, total):
    percentage = current / total
@@ -79,3 +80,10 @@ async def download_pdf(base_url, pdf_url, idx, media_type):
    except Exception as e:
        print(f"Error downloading PDF from {pdf_url}: {e}")
        return None

async def init_headless_browser(url):
    # Selenium's calls block even inside a coroutine; the `options.headless`
    # attribute was removed in Selenium 4.10+, so request headless mode via a
    # Chrome argument instead.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    return driver
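
A minimal usage sketch for the new helper (the URL is illustrative; note that Selenium's calls block the event loop even though the helper is a coroutine):

```python
import asyncio
from helpers import init_headless_browser

async def main():
    driver = await init_headless_browser("https://example.com")
    print(driver.title)           # title of the loaded page
    print(driver.get_cookies())   # cookies set by the page, as a list of dicts
    driver.quit()                 # always release the Chrome process

asyncio.run(main())
```
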
10 changes: 9 additions & 1 deletion main.py
@@ -9,7 +9,7 @@
from dotenv import load_dotenv
import os
from pyrogram.types import Message
from scraper import all_audio_scraping, all_images_scraping, all_links_scraping, all_paragraph_scraping, all_pdf_scraping, all_video_scraping, extract_cookies, extract_local_storage, html_data_scraping, raw_data_scraping
from utils import OPTIONS, START_BUTTON, START_TEXT

load_dotenv()
@@ -49,6 +49,14 @@ async def cb_data(bot, update):
        await all_video_scraping(update)
    elif update.data == "cballpdf":
        await all_pdf_scraping(update)
    elif update.data == "cbmetadata":
        # Metadata scraping is not implemented yet.
        pass
    elif update.data == "cbcookies":
        await extract_cookies(update)
    elif update.data == "cblocalstorage":
        await extract_local_storage(update)
    else:
        await update.message.edit_text(
            text=START_TEXT,
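
For context, a self-contained sketch of how pyrogram delivers a pressed button's `callback_data` to a handler like `cb_data` above (the session name and credentials are placeholders, not values from this repo):

```python
from pyrogram import Client

app = Client("demo", api_id=12345, api_hash="...", bot_token="...")

@app.on_callback_query()
async def cb_data(bot, update):
    # update.data carries the callback_data of the pressed inline button,
    # e.g. "cbcookies" or "cblocalstorage" from the OPTIONS keyboard.
    print(update.data)

app.run()
```
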
3 changes: 2 additions & 1 deletion requirements.txt
@@ -4,4 +4,5 @@ requests
beautifulsoup4
html5lib
telegraph
python-dotenv
selenium
69 changes: 68 additions & 1 deletion scraper.py
@@ -7,7 +7,7 @@
from bs4 import BeautifulSoup
from urllib.parse import quote
from utils import REPO
from helpers import download_image, download_media, download_pdf, init_headless_browser, progress_bar

async def scrape(url):
    try:
@@ -334,3 +334,70 @@ async def all_pdf_scraping(query):
            [[InlineKeyboardButton("Create Issue", url=error_link)]])
        await message.reply_text(text, disable_web_page_preview=True, quote=True, reply_markup=issue_markup)
        return e


async def extract_cookies(query):
    try:
        url = query.message.text
        message = query.message
        chat_id = message.chat.id
        txt = await message.reply_text("Initiating Chrome driver...", quote=True)
        driver = await init_headless_browser(url)

        await txt.edit('Getting cookies...')
        cookies = driver.get_cookies()
        driver.quit()  # release the browser as soon as the cookies are captured
        await txt.edit('Preparing files...')
        with open(f'Cookies-{chat_id}.txt', 'w') as cookie_file:
            cookie_file.write(f"{cookies}")
        await txt.edit('Uploading...')
        await message.reply_document(f"Cookies-{chat_id}.txt", caption="©@BugHunterBots", quote=True)
        await asyncio.sleep(1)
        os.remove(f"Cookies-{chat_id}.txt")
        await txt.delete()
    except Exception as e:
        # Clean up only if the file was actually created before the failure.
        if os.path.exists(f"Cookies-{chat_id}.txt"):
            os.remove(f"Cookies-{chat_id}.txt")
        error = f"ERROR: {e}"
        error_link = f"{REPO}/issues/new?title={quote(error)}"
        text = 'Something bad occurred!\nCreate an issue here'
        issue_markup = InlineKeyboardMarkup(
            [[InlineKeyboardButton("Create Issue", url=error_link)]])
        await message.reply_text(text, disable_web_page_preview=True, quote=True, reply_markup=issue_markup)
        return e
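
One hedged refinement, not part of this commit: `driver.get_cookies()` returns a list of dicts, so serializing to JSON would keep the exported file machine-readable:

```python
import json

def write_cookies_json(cookies, path):
    # `cookies` is the list of dicts from driver.get_cookies(), each with
    # keys such as 'name', 'value', 'domain', and 'path'.
    with open(path, 'w') as f:
        json.dump(cookies, f, indent=2)
```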

async def extract_local_storage(query):
    try:
        url = query.message.text
        message = query.message
        chat_id = message.chat.id
        txt = await message.reply_text("Initiating Chrome driver...", quote=True)
        driver = await init_headless_browser(url)
        # Walk every key in the page's localStorage and return it as a dict.
        local_storage_script = """
        var storage = {};
        for (var i = 0; i < localStorage.length; i++) {
            var key = localStorage.key(i);
            storage[key] = localStorage.getItem(key);
        }
        return storage;
        """
        await txt.edit('Executing script...')
        local_storage = driver.execute_script(local_storage_script)
        driver.quit()  # release the browser once the storage is captured
        await txt.edit('Preparing files...')
        with open(f'localStorage-{chat_id}.txt', 'w') as storage_file:
            storage_file.write(f"{local_storage}")
        await txt.edit('Uploading...')
        await message.reply_document(f'localStorage-{chat_id}.txt', caption="©@BugHunterBots", quote=True)
        await asyncio.sleep(1)
        os.remove(f"localStorage-{chat_id}.txt")
        await txt.delete()
    except Exception as e:
        # Clean up only if the file was actually created before the failure.
        if os.path.exists(f"localStorage-{chat_id}.txt"):
            os.remove(f"localStorage-{chat_id}.txt")
        error = f"ERROR: {e}"
        error_link = f"{REPO}/issues/new?title={quote(error)}"
        text = 'Something bad occurred!\nCreate an issue here'
        issue_markup = InlineKeyboardMarkup(
            [[InlineKeyboardButton("Create Issue", url=error_link)]])
        await message.reply_text(text, disable_web_page_preview=True, quote=True, reply_markup=issue_markup)
        return e
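
The same `execute_script` pattern generalizes: Selenium marshals the returned JavaScript object into a Python dict, so a hypothetical sessionStorage variant (not in this commit) needs only a different storage object:

```python
SESSION_STORAGE_SCRIPT = """
var storage = {};
for (var i = 0; i < sessionStorage.length; i++) {
    var key = sessionStorage.key(i);
    storage[key] = sessionStorage.getItem(key);
}
return storage;
"""

def get_session_storage(driver):
    # Returns the page's sessionStorage as a plain Python dict.
    return driver.execute_script(SESSION_STORAGE_SCRIPT)
```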

13 changes: 11 additions & 2 deletions utils.py
@@ -24,8 +24,10 @@
    ]
)

BACK_BUTTON = [[InlineKeyboardButton('Back', callback_data='cbclose')]]

CLOSE_BUTTON = InlineKeyboardMarkup(
    BACK_BUTTON
)

OPTIONS = InlineKeyboardMarkup(
@@ -47,6 +49,13 @@
        ],
        [
            InlineKeyboardButton('📚 All PDFs', callback_data='cballpdf')
        ],
        [
            InlineKeyboardButton('🍪 Cookies', callback_data='cbcookies'),
            InlineKeyboardButton('📦 LocalStorage', callback_data='cblocalstorage')
        ],
        [
            InlineKeyboardButton('📊 Metadata', callback_data='cbmetadata')
        ]
    ]
)
