[go: nahoru, domu]

Skip to content

Commit

Permalink
优化数据库储存采集数据功能
Browse files Browse the repository at this point in the history
  • Loading branch information
JoeanAmier committed Dec 9, 2023
1 parent 73a4d85 commit 14c5511
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 25 deletions.
17 changes: 17 additions & 0 deletions docs/TikTokDownloader文档.md
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,24 @@
```

<p>将待下载的账号信息写入配置文件,每个账号对应一个对象/字典,<code>tab</code> 参数设置为 <code>favorite</code> 代表批量下载喜欢作品,支持多账号。</p>
<h3>发布日期限制</h3>

```json
{
"accounts_urls": [
{
"mark": "账号标识",
"url": "账号主页链接",
"tab": "post",
"earliest": "2023/12/1",
"latest": ""
}
]
}
```

<p>如果已经采集某账号的全部发布作品,建议设置 <code>earliest</code> 和 <code>latest</code> 参数以减少后续采集请求次数。</p>
<p>例如:将 <code>earliest</code> 参数设置为 <code>2023/12/1</code>,程序获取账号发布作品数据时,无需获取早于 <code>2023/12/1</code> 的作品数据,可减少请求次数提高运行效率。</p>
<h3>文件储存路径</h3>

```json
Expand Down
41 changes: 21 additions & 20 deletions src/DataExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ def classifying_works(self, item: dict, data: SimpleNamespace) -> None:
self.extract_video_info(item, data)

def extract_additional_info(self, item: dict, data: SimpleNamespace):
item["height"] = self.safe_extract(data, "video.height")
item["width"] = self.safe_extract(data, "video.width")
item["height"] = self.safe_extract(data, "video.height", "-1")
item["width"] = self.safe_extract(data, "video.width", "-1")
item["ratio"] = self.safe_extract(data, "video.ratio")
item["share_url"] = self.__clean_share_url(
self.safe_extract(data, "share_url"))
Expand Down Expand Up @@ -291,7 +291,7 @@ def extract_statistics(self, item: dict, data: SimpleNamespace) -> None:
"collect_count",
"share_count",
):
item[i] = str(self.safe_extract(data, i))
item[i] = str(self.safe_extract(data, i, "-1"))

def extract_tags(self, item: dict, data: SimpleNamespace) -> None:
if not (t := self.safe_extract(data, "video_tag")):
Expand All @@ -313,7 +313,7 @@ def extract_account_info(
container.cache["short_id"] = self.safe_extract(data, "short_id")
container.cache["unique_id"] = self.safe_extract(data, "unique_id")
container.cache["signature"] = self.safe_extract(data, "signature")
container.cache["user_age"] = self.safe_extract(data, "user_age")
container.cache["user_age"] = self.safe_extract(data, "user_age", "-1")
self.extract_nickname_info(container, data)

def extract_nickname_info(self,
Expand Down Expand Up @@ -346,6 +346,7 @@ def preprocessing_data(self,
default="无效账号昵称")
title = self.cleaner.filter_name(self.safe_extract(
item, "mix_info.mix_name", f"合集_{str(time())[:10]}"),
inquire=mix,
default="无效合集标题")
mark = self.cleaner.filter_name(
mark, inquire=False, default=title if mix else name)
Expand Down Expand Up @@ -402,11 +403,11 @@ def _extract_comments_data(
container.cache["sticker"] = self.safe_extract(
data, "sticker.static_url.url_list[-1]")
container.cache["digg_count"] = str(
self.safe_extract(data, "digg_count"))
self.safe_extract(data, "digg_count", "-1"))
container.cache["reply_to_reply_id"] = self.safe_extract(
data, "reply_to_reply_id")
container.cache["reply_comment_total"] = str(
self.safe_extract(data, "reply_comment_total", 0))
self.safe_extract(data, "reply_comment_total", "0"))
container.cache["reply_id"] = self.safe_extract(data, "reply_id")
container.cache["cid"] = self.safe_extract(data, "cid")
self.extract_account_info(container, data, "user")
Expand All @@ -418,7 +419,7 @@ def _extract_reply_ids(self, container: SimpleNamespace, data: dict):
container.cache = {
"reply_comment_total": str(
self.safe_extract(
cache, "reply_comment_total", 0)), "cid": self.safe_extract(
cache, "reply_comment_total", "0")), "cid": self.safe_extract(
cache, "cid")}
self._filter_reply_ids(container)
container.all_data.append(data)
Expand Down Expand Up @@ -476,15 +477,15 @@ def _extract_user_data(
container.cache["country"] = self.safe_extract(data, "country")
container.cache["district"] = self.safe_extract(data, "district")
container.cache["favoriting_count"] = str(
self.safe_extract(data, "favoriting_count"))
self.safe_extract(data, "favoriting_count", "-1"))
container.cache["follower_count"] = str(
self.safe_extract(data, "follower_count"))
self.safe_extract(data, "follower_count", "-1"))
container.cache["max_follower_count"] = str(
self.safe_extract(data, "max_follower_count"))
self.safe_extract(data, "max_follower_count", "-1"))
container.cache["following_count"] = str(
self.safe_extract(data, "following_count"))
self.safe_extract(data, "following_count", "-1"))
container.cache["total_favorited"] = str(
self.safe_extract(data, "total_favorited"))
self.safe_extract(data, "total_favorited", "-1"))
container.cache["gender"] = {1: "男", 2: "女"}.get(
self.safe_extract(data, "gender"), "未知")
container.cache["ip_location"] = self.safe_extract(data, "ip_location")
Expand All @@ -496,12 +497,12 @@ def _extract_user_data(
container.cache["uid"] = self.safe_extract(data, "uid")
container.cache["unique_id"] = self.safe_extract(data, "unique_id")
container.cache["user_age"] = str(
self.safe_extract(data, "user_age", -1))
self.safe_extract(data, "user_age", "-1"))
container.cache["cover"] = self.safe_extract(
data, "cover_url[0].url_list[-1]")
container.cache["short_id"] = self.safe_extract(data, "short_id")
container.cache["aweme_count"] = str(
self.safe_extract(data, "aweme_count"))
self.safe_extract(data, "aweme_count", "-1"))
container.cache["verify"] = self.safe_extract(
data, "custom_verify", "无")
container.cache["enterprise"] = self.safe_extract(
Expand Down Expand Up @@ -587,9 +588,9 @@ def _deal_search_user_live(self,
data, "enterprise_verify_reason", "无")
if user:
container.cache["follower_count"] = str(
self.safe_extract(data, "follower_count"))
self.safe_extract(data, "follower_count", "-1"))
container.cache["total_favorited"] = str(
self.safe_extract(data, "total_favorited"))
self.safe_extract(data, "total_favorited", "-1"))
container.cache["unique_id"] = self.safe_extract(data, "unique_id")
container.all_data.append(container.cache)
# else:
Expand Down Expand Up @@ -630,13 +631,13 @@ def hot(self, data: list[dict], recorder) -> list[dict]:

def _deal_hot_data(self, container: list, data: SimpleNamespace):
    """Flatten one hot-search board entry into a dict and append it to *container*.

    The source span was diff residue containing both the old and the new
    version of several dict entries; only the updated lines (with explicit
    ``"-1"`` string defaults for numeric fields) are kept so every key
    appears exactly once.

    :param container: list collecting one cache dict per hot-search entry
    :param data: SimpleNamespace wrapping one raw hot-search item
    """
    cache = {
        # numeric fields default to the string "-1" when absent so the
        # stored value is always a string (never None)
        "position": str(self.safe_extract(data, "position", "-1")),
        "sentence_id": self.safe_extract(data, "sentence_id"),
        "word": self.safe_extract(data, "word"),
        "video_count": str(self.safe_extract(data, "video_count", "-1")),
        "event_time": self.format_date(data, "event_time"),
        "view_count": str(self.safe_extract(data, "view_count", "-1")),
        "hot_value": str(self.safe_extract(data, "hot_value", "-1")),
        # last URL of the cover-image url_list
        "cover": self.safe_extract(data, "word_cover.url_list[-1]"),
    }
    container.append(cache)
Expand Down
17 changes: 12 additions & 5 deletions src/Recorder.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@ def save(self, data, *args, **kwargs):
class SQLLogger(NoneLogger):
"""SQLite保存数据"""
SHEET_NAME = compile(r"[^\u4e00-\u9fa5a-zA-Z0-9_]")
CHECK_SQL = "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=?;"
UPDATE_SQL = "ALTER TABLE ? RENAME TO ?;"

def __init__(
self,
Expand Down Expand Up @@ -292,11 +294,16 @@ def update_sheet(self):
return
mark[-1] = old_sheet
old_sheet = "_".join(mark)
update_sql = f"ALTER TABLE {old_sheet} RENAME TO {new_sheet};"
self.cursor.execute(update_sql)
self.db.commit()
if self.__check_sheet_exists(old_sheet):
self.cursor.execute(self.UPDATE_SQL, (old_sheet, new_sheet))
self.db.commit()
self.name = new_sheet

def __check_sheet_exists(self, sheet: str) -> bool:
    """Return True when a table named *sheet* is present in the database.

    Executes CHECK_SQL (a COUNT(*) query) with the table name bound as a
    parameter, so the name is never interpolated into the SQL text.
    """
    self.cursor.execute(self.CHECK_SQL, (sheet,))
    (count,) = self.cursor.fetchone()  # COUNT(*) always yields exactly one row
    return bool(count)

def __clean_sheet_name(self, name: tuple) -> tuple:
return self.__clean_characters(
name[0]), self.__clean_characters(
Expand Down Expand Up @@ -391,8 +398,8 @@ class RecordManager:
"TEXT",
"TEXT",
"TEXT",
"TEXT",
"TEXT",
"INTEGER",
"INTEGER",
"TEXT",
"TEXT",
"TEXT",
Expand Down

0 comments on commit 14c5511

Please sign in to comment.