
Added cookies and localstorage scraping
nuhmanpk committed Nov 12, 2023
1 parent 78f93db commit 84f06d6
Showing 9 changed files with 111 additions and 7 deletions.
2 changes: 0 additions & 2 deletions README.md
@@ -1,8 +1,6 @@
# WebScrapperRoBot
Simple web scraper bot to scrape webpages using Requests, html5lib and BeautifulSoup.

![Screenshot](https://github.com/nuhmanpk/WebScrapper/blob/main/assets/new.png)
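
A minimal sketch of the fetch-and-parse pattern behind that stack (the URL and the link-extraction example are illustrative, not code from the bot):

```python
import requests
from bs4 import BeautifulSoup

def scrape_links(url):
    # Fetch the page, then parse it with the html5lib parser the bot relies on.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html5lib')
    # Collect every hyperlink found in the document.
    return [a['href'] for a in soup.find_all('a', href=True)]

print(scrape_links('https://example.com'))
```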

# Setting Up a Project and Configuring Environment Variables

To set up the project and configure environment variables, follow these steps:
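
The numbered steps are collapsed in this view. As a hedged sketch, `main.py` calls `load_dotenv()`, so configuration is read from environment variables roughly like this (the variable names below are common pyrogram credentials and are assumptions, not confirmed by this diff):

```python
# Hedged sketch only: variable names are assumed, not taken from the repo.
import os
from dotenv import load_dotenv

load_dotenv()                       # reads key=value pairs from a local .env file
API_ID = os.getenv("API_ID")        # assumed Telegram API credentials
API_HASH = os.getenv("API_HASH")
BOT_TOKEN = os.getenv("BOT_TOKEN")
```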
13 changes: 13 additions & 0 deletions demos/README.md
@@ -0,0 +1,13 @@
# WebScrapper Bot Demos

This folder contains screenshots showcasing the features of the WebScrapper bot.

## Screenshots

### Menu

![Menu](menu.png)

### Video Scraping

![Video Scraping](video-scraping.png)
File renamed without changes
Binary file added demos/video-scraping.png
8 changes: 8 additions & 0 deletions helpers.py
@@ -3,6 +3,7 @@
import requests
from utils import FINISHED_PROGRESS_STR, UN_FINISHED_PROGRESS_STR
import os
from selenium import webdriver

async def progress_bar(current, total):
    percentage = current / total
@@ -79,3 +80,10 @@ async def download_pdf(base_url, pdf_url, idx, media_type):
    except Exception as e:
        print(f"Error downloading PDF from {pdf_url}: {e}")
        return None

async def init_headless_browser(url):
    # Selenium's calls block even inside a coroutine; the `options.headless`
    # attribute was removed in Selenium 4.10+, so request headless mode via a
    # Chrome argument instead.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    return driver
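
A minimal usage sketch for the new helper (the URL is illustrative; note that Selenium's calls block the event loop even though the helper is a coroutine):

```python
import asyncio
from helpers import init_headless_browser

async def main():
    driver = await init_headless_browser("https://example.com")
    print(driver.title)           # title of the loaded page
    print(driver.get_cookies())   # cookies set by the page, as a list of dicts
    driver.quit()                 # always release the Chrome process

asyncio.run(main())
```
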
10 changes: 9 additions & 1 deletion main.py
@@ -9,7 +9,7 @@
from dotenv import load_dotenv
import os
from pyrogram.types import Message
from scraper import all_audio_scraping, all_images_scraping, all_links_scraping, all_paragraph_scraping, all_pdf_scraping, all_video_scraping, extract_cookies, extract_local_storage, html_data_scraping, raw_data_scraping
from utils import OPTIONS, START_BUTTON, START_TEXT

load_dotenv()
@@ -49,6 +49,14 @@ async def cb_data(bot, update):
        await all_video_scraping(update)
    elif update.data == "cballpdf":
        await all_pdf_scraping(update)
    elif update.data == "cbmetadata":
        # Metadata scraping is not implemented yet.
        pass
    elif update.data == "cbcookies":
        await extract_cookies(update)
    elif update.data == "cblocalstorage":
        await extract_local_storage(update)
    else:
        await update.message.edit_text(
            text=START_TEXT,
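
For context, a self-contained sketch of how pyrogram delivers a pressed button's `callback_data` to a handler like `cb_data` above (the session name and credentials are placeholders, not values from this repo):

```python
from pyrogram import Client

app = Client("demo", api_id=12345, api_hash="...", bot_token="...")

@app.on_callback_query()
async def cb_data(bot, update):
    # update.data carries the callback_data of the pressed inline button,
    # e.g. "cbcookies" or "cblocalstorage" from the OPTIONS keyboard.
    print(update.data)

app.run()
```
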
3 changes: 2 additions & 1 deletion requirements.txt
@@ -4,4 +4,5 @@ requests
beautifulsoup4
html5lib
telegraph
python-dotenv
selenium
69 changes: 68 additions & 1 deletion scraper.py
@@ -7,7 +7,7 @@
from bs4 import BeautifulSoup
from urllib.parse import quote
from utils import REPO
from helpers import download_image, download_media, download_pdf, init_headless_browser, progress_bar

async def scrape(url):
    try:
@@ -334,3 +334,70 @@ async def all_pdf_scraping(query):
            [[InlineKeyboardButton("Create Issue", url=error_link)]])
        await message.reply_text(text, disable_web_page_preview=True, quote=True, reply_markup=issue_markup)
        return e


async def extract_cookies(query):
    try:
        url = query.message.text
        message = query.message
        chat_id = message.chat.id
        txt = await message.reply_text("Initiating Chrome driver...", quote=True)
        driver = await init_headless_browser(url)

        await txt.edit('Getting cookies...')
        cookies = driver.get_cookies()
        driver.quit()  # release the browser as soon as the cookies are captured
        await txt.edit('Preparing files...')
        with open(f'Cookies-{chat_id}.txt', 'w') as cookie_file:
            cookie_file.write(f"{cookies}")
        await txt.edit('Uploading...')
        await message.reply_document(f"Cookies-{chat_id}.txt", caption="©@BugHunterBots", quote=True)
        await asyncio.sleep(1)
        os.remove(f"Cookies-{chat_id}.txt")
        await txt.delete()
    except Exception as e:
        # Clean up only if the file was actually created before the failure.
        if os.path.exists(f"Cookies-{chat_id}.txt"):
            os.remove(f"Cookies-{chat_id}.txt")
        error = f"ERROR: {e}"
        error_link = f"{REPO}/issues/new?title={quote(error)}"
        text = 'Something bad occurred!\nCreate an issue here'
        issue_markup = InlineKeyboardMarkup(
            [[InlineKeyboardButton("Create Issue", url=error_link)]])
        await message.reply_text(text, disable_web_page_preview=True, quote=True, reply_markup=issue_markup)
        return e
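
One hedged refinement, not part of this commit: `driver.get_cookies()` returns a list of dicts, so serializing to JSON would keep the exported file machine-readable:

```python
import json

def write_cookies_json(cookies, path):
    # `cookies` is the list of dicts from driver.get_cookies(), each with
    # keys such as 'name', 'value', 'domain', and 'path'.
    with open(path, 'w') as f:
        json.dump(cookies, f, indent=2)
```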

async def extract_local_storage(query):
    try:
        url = query.message.text
        message = query.message
        chat_id = message.chat.id
        txt = await message.reply_text("Initiating Chrome driver...", quote=True)
        driver = await init_headless_browser(url)
        # Walk every key in the page's localStorage and return it as a dict.
        local_storage_script = """
        var storage = {};
        for (var i = 0; i < localStorage.length; i++) {
            var key = localStorage.key(i);
            storage[key] = localStorage.getItem(key);
        }
        return storage;
        """
        await txt.edit('Executing script...')
        local_storage = driver.execute_script(local_storage_script)
        driver.quit()  # release the browser once the storage is captured
        await txt.edit('Preparing files...')
        with open(f'localStorage-{chat_id}.txt', 'w') as storage_file:
            storage_file.write(f"{local_storage}")
        await txt.edit('Uploading...')
        await message.reply_document(f'localStorage-{chat_id}.txt', caption="©@BugHunterBots", quote=True)
        await asyncio.sleep(1)
        os.remove(f"localStorage-{chat_id}.txt")
        await txt.delete()
    except Exception as e:
        # Clean up only if the file was actually created before the failure.
        if os.path.exists(f"localStorage-{chat_id}.txt"):
            os.remove(f"localStorage-{chat_id}.txt")
        error = f"ERROR: {e}"
        error_link = f"{REPO}/issues/new?title={quote(error)}"
        text = 'Something bad occurred!\nCreate an issue here'
        issue_markup = InlineKeyboardMarkup(
            [[InlineKeyboardButton("Create Issue", url=error_link)]])
        await message.reply_text(text, disable_web_page_preview=True, quote=True, reply_markup=issue_markup)
        return e
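
The same `execute_script` pattern generalizes: Selenium marshals the returned JavaScript object into a Python dict, so a hypothetical sessionStorage variant (not in this commit) needs only a different storage object:

```python
SESSION_STORAGE_SCRIPT = """
var storage = {};
for (var i = 0; i < sessionStorage.length; i++) {
    var key = sessionStorage.key(i);
    storage[key] = sessionStorage.getItem(key);
}
return storage;
"""

def get_session_storage(driver):
    # Returns the page's sessionStorage as a plain Python dict.
    return driver.execute_script(SESSION_STORAGE_SCRIPT)
```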

13 changes: 11 additions & 2 deletions utils.py
@@ -24,8 +24,10 @@
    ]
)

BACK_BUTTON = [[InlineKeyboardButton('Back', callback_data='cbclose')]]

CLOSE_BUTTON = InlineKeyboardMarkup(
    BACK_BUTTON
)

OPTIONS = InlineKeyboardMarkup(
@@ -47,6 +49,13 @@
        ],
        [
            InlineKeyboardButton('📚 All PDFs', callback_data='cballpdf')
        ],
        [
            InlineKeyboardButton('🍪 Cookies', callback_data='cbcookies'),
            InlineKeyboardButton('📦 LocalStorage', callback_data='cblocalstorage')
        ],
        [
            InlineKeyboardButton('📊 Metadata', callback_data='cbmetadata')
        ]
    ]
)
