-
Notifications
You must be signed in to change notification settings - Fork 9
/
crawler-modb.py
82 lines (63 loc) · 3.09 KB
/
crawler-modb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException
import re
from bs4 import BeautifulSoup
# Create a new Chrome browser instance and navigate to website A
s = Service('/opt/homebrew/Caskroom/chromedriver/112.0.5615.49/chromedriver')
driver = webdriver.Chrome(service=s)
driver.get("https://www.modb.pro/u/15144")
# Wait for the element associated with the tab B to become clickable
b_tab = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.XPATH, "/html/body/div/div/section/div[1]/div[2]/div[2]/div[1]/div[1]/div[2]/div"))
)
# 切换到 tab,这里的 tab 是文章选卡
b_tab.click()
time.sleep(5)
def scroll_to_bottom(driver, css_selector):
# 如果数据加载不出来,调大 SCROLL_PAUSE_TIME,且 SCROLL_PAUSE_TIME 数值要比下面 WebDriverWait(driver, 60) 中的数值大
SCROLL_PAUSE_TIME = 60
last_loaded_count = 0
while True:
elements = driver.find_elements(By.CSS_SELECTOR, css_selector)
print(f"Current loaded elements count: {len(elements)}") # Debug info
if len(elements) > last_loaded_count:
last_loaded_count = len(elements)
driver.execute_script("arguments[0].scrollIntoView();", elements[-1])
time.sleep(SCROLL_PAUSE_TIME)
# 如果数据加载不出来,调大下面的 60
WebDriverWait(driver, 60).until(EC.visibility_of(elements[-1])) # Add explicit wait for the last element to be visible
else:
break
# ... (the rest of the code remains the same)
# Scroll to the bottom, until all content is loaded
scroll_to_bottom(driver, ".b-border-item.font14.flex.between.knowledge-item.pdlr20.mt20")
# Extract the HTML source of the page
html_content = driver.page_source
# 使用BeautifulSoup解析HTML内容
soup = BeautifulSoup(html_content, 'html.parser')
modiv = soup.find("div", {"id": "actionH"})
read_num = 0
for span_with_views_list in modiv.find_all("span", class_="views font12"):
# 被注释掉的这条语句主要用来调试是否抓取了你想要的 td 数据;在 macOS 下选中命令用 command + / 即可取消注释;
# print(span_with_views_list)
for span_with_views in span_with_views_list:
# 这个也是用来调试的打印程序;
# print(td_with_views)
if span_with_views:
# 通过标题属性抓取浏览量
# views_title = td_with_views.find("span", {"class": "number"})["title"]
# 使用正则表达式提取数字
views_num = int(re.sub("[^0-9]", "", span_with_views.text))
read_num = read_num + views_num
# print(views_num)
else:
print("无法找到包含'num views'的td")
print("总阅读量是" + str(read_num))
# 关闭webdriver
driver.quit()