[go: nahoru, domu]

Skip to content

Commit

Permalink
potential issues dropped issues (#3775)
Browse files Browse the repository at this point in the history
update how comment renders includes comments about dropped files and
potential issues

---------

Co-authored-by: Martin Ye <martin@sweep.dev>
  • Loading branch information
MartinYe1234 and Martin Ye committed May 17, 2024
1 parent 78f1e08 commit 5b8b9fb
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 34 deletions.
6 changes: 4 additions & 2 deletions sweepai/core/review_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def get_pr_changes(repo: Repository, pr: PullRequest) -> list[PRChange]:
file_diffs = comparison.files

pr_diffs = []
dropped_files = []
for file in tqdm(file_diffs, desc="Annotating diffs"):
file_name = file.filename
diff = file.patch
Expand All @@ -64,6 +65,7 @@ def get_pr_changes(repo: Repository, pr: PullRequest) -> list[PRChange]:

# drop excluded files: for example package-lock.json files
if sweep_config.is_file_excluded(file_name):
dropped_files.append(file_name)
continue

if file.status == "added":
Expand Down Expand Up @@ -95,7 +97,7 @@ def get_pr_changes(repo: Repository, pr: PullRequest) -> list[PRChange]:
pr_diffs.append(
pr_change
)
return pr_diffs
return pr_diffs, dropped_files

def split_diff_into_patches(diff: str) -> list[Patch]:
patches = []
Expand Down Expand Up @@ -319,7 +321,7 @@ def review_code_changes_by_file(self, pr_changes_by_file: dict[str, str], chat_l
if issues_matches:
issues = "\n".join([match.strip() for match in issues_matches])
potential_issues = parse_issues_from_code_review(issues)
code_reviews_by_file[file_name] = CodeReview(file_name=file_name, diff_summary=diff_summary, issues=potential_issues)
code_reviews_by_file[file_name] = CodeReview(file_name=file_name, diff_summary=diff_summary, issues=potential_issues, potential_issues=[])
if chat_logger:
chat_logger.add_chat(
{
Expand Down
1 change: 1 addition & 0 deletions sweepai/dataclasses/codereview.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class CodeReview:
file_name: str
diff_summary: str
issues: list[CodeReviewIssue]
potential_issues: list[CodeReviewIssue]

@dataclass
class Patch:
Expand Down
33 changes: 21 additions & 12 deletions sweepai/handlers/review_pr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import multiprocessing
import os
from time import time
Expand Down Expand Up @@ -88,17 +89,20 @@ def group_vote_review_pr(pr_changes: list[PRChange], formatted_pr_changes_by_fil
all_flattened_embeddings = np.array(all_embeddings)
# note DBSCAN expects a shape with less than or equal to 2 dimensions
try:
db = DBSCAN(eps=0.5, min_samples=3).fit(all_flattened_embeddings)
if all_flattened_embeddings.size:
db = DBSCAN(eps=0.5, min_samples=3).fit(all_flattened_embeddings)
files_to_labels[file_name] = db.labels_
else:
files_to_labels[file_name] = []
except ValueError as e:
logger.error(f"Error with dbscan {e}")
files_to_labels[file_name] = db.labels_


LABEL_THRESHOLD = 4
# get the labels that have a count greater than the threshold
# format: {file_name: {label: [index, ...]}}
files_to_labels_indexes = {}
for file_name, labels in files_to_labels.items():
index_dict = {}
index_dict: dict[str, list[int]] = {}
for i, v in enumerate(labels):
key = str(v)
if key not in index_dict:
Expand All @@ -109,18 +113,23 @@ def group_vote_review_pr(pr_changes: list[PRChange], formatted_pr_changes_by_fil
# create the final code_reviews_by_file
for file_name, labels_dict in files_to_labels_indexes.items():
# pick first one as diff summary doesnt really matter
final_code_review: CodeReview = code_reviews_by_file[0][file_name]
final_code_review: CodeReview = copy.deepcopy(code_reviews_by_file[0][file_name])
final_code_review.issues = []
final_code_review.potential_issues = []
final_issues = []
potential_issues = []
for label, indexes in labels_dict.items():
final_issues = []
# -1 is considered as noise
if len(indexes) >= LABEL_THRESHOLD and label != "-1":
# add to final issues, first issue - TODO use similarity score of all issues against each other
final_issues.append(files_to_issues[file_name][indexes[0]])
final_code_review.issues = final_issues

majority_code_review_by_file[file_name] = final_code_review

# get potential issues which are one below the label_threshold
if len(indexes) == LABEL_THRESHOLD - 1 and label != "-1":
# add to final issues, first issue - TODO use similarity score of all issues against each other
potential_issues.append(files_to_issues[file_name][indexes[0]])
final_code_review.issues = final_issues
final_code_review.potential_issues = potential_issues
majority_code_review_by_file[file_name] = copy.deepcopy(final_code_review)
return majority_code_review_by_file

def review_pr(username: str, pr: PullRequest, repository: Repository, installation_id: int, tracking_id: str | None = None):
Expand Down Expand Up @@ -158,10 +167,10 @@ def review_pr(username: str, pr: PullRequest, repository: Repository, installati
return {"success": False, "reason": "PR is closed"}
# handle creating comments on the pr to tell the user we are going to begin reviewing the pr
_comment_id = create_update_review_pr_comment(pr)
pr_changes = get_pr_changes(repository, pr)
pr_changes, dropped_files = get_pr_changes(repository, pr)
formatted_pr_changes_by_file = format_pr_changes_by_file(pr_changes)
code_review_by_file = group_vote_review_pr(pr_changes, formatted_pr_changes_by_file, multiprocess=True, chat_logger=chat_logger)
_comment_id = create_update_review_pr_comment(pr, code_review_by_file=code_review_by_file)
_comment_id = create_update_review_pr_comment(pr, code_review_by_file=code_review_by_file, dropped_files=dropped_files)
except Exception as e:
posthog.capture(
username,
Expand Down
50 changes: 31 additions & 19 deletions sweepai/utils/ticket_rendering_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,23 +615,27 @@ def parse_issues_from_code_review(issue_string: str):
return potential_issues

# converts the list of issues inside a code_review into markdown text to display in a github comment
def render_code_review_issues(pr: PullRequest, code_review: CodeReview):
def render_code_review_issues(pr: PullRequest, code_review: CodeReview, issue_type: str = ""):
files_to_blobs = {file.filename: file.blob_url for file in list(pr.get_files())}
# generate the diff urls
files_to_diffs = {}
for file_name, _ in files_to_blobs.items():
sha_256 = hashlib.sha256(file_name.encode('utf-8')).hexdigest()
files_to_diffs[file_name] = f"{pr.html_url}/files#diff-{sha_256}"
potential_issues = ""
for issue in code_review.issues:
if issue.start_line == issue.end_line:
issue_blob_url = f"{files_to_blobs[code_review.file_name]}#L{issue.start_line}"
issue_diff_url = f"{files_to_diffs[code_review.file_name]}R{issue.start_line}"
else:
issue_blob_url = f"{files_to_blobs[code_review.file_name]}#L{issue.start_line}-L{issue.end_line}"
issue_diff_url = f"{files_to_diffs[code_review.file_name]}R{issue.start_line}-R{issue.end_line}"
potential_issues += f"<li>{issue.issue_description}</li>\n\n{issue_blob_url}\n[View Diff]({issue_diff_url})"
return potential_issues
code_issues = code_review.issues
if issue_type == "potential":
code_issues = code_review.potential_issues
code_issues_string = ""
for issue in code_issues:
if code_review.file_name in files_to_blobs:
if issue.start_line == issue.end_line:
issue_blob_url = f"{files_to_blobs[code_review.file_name]}#L{issue.start_line}"
issue_diff_url = f"{files_to_diffs[code_review.file_name]}R{issue.start_line}"
else:
issue_blob_url = f"{files_to_blobs[code_review.file_name]}#L{issue.start_line}-L{issue.end_line}"
issue_diff_url = f"{files_to_diffs[code_review.file_name]}R{issue.start_line}-R{issue.end_line}"
code_issues_string += f"<li>{issue.issue_description}</li>\n\n{issue_blob_url}\n[View Diff]({issue_diff_url})"
return code_issues_string

def escape_html(text: str) -> str:
return text.replace('<', '&lt;').replace('>', '&gt;')
Expand Down Expand Up @@ -670,24 +674,32 @@ def format_code_sections(text: str) -> str:
return '<code>'.join(parts)

# turns code_review_by_file into markdown string
def render_pr_review_by_file(pr: PullRequest, code_review_by_file: dict[str, CodeReview]) -> str:
def render_pr_review_by_file(pr: PullRequest, code_review_by_file: dict[str, CodeReview], dropped_files: list[str] = []) -> str:
body = f"{SWEEP_PR_REVIEW_HEADER}\n"
reviewed_files = ""
for file_name, code_review in code_review_by_file.items():
potential_issues = code_review.issues
sweep_issues = code_review.issues
potential_issues = code_review.potential_issues
reviewed_files += f"""<details open>
<summary>{file_name}</summary>
<p>{format_code_sections(code_review.diff_summary)}</p>"""
if sweep_issues:
sweep_issues_string = render_code_review_issues(pr, code_review)
reviewed_files += f"<p><strong>Sweep Found These Issues</strong></p><ul>{format_code_sections(sweep_issues_string)}</ul>"
if potential_issues:
potential_issues_string = render_code_review_issues(pr, code_review)
reviewed_files += f"<p><strong>Sweep Found These Issues</strong></p><ul>{format_code_sections(potential_issues_string)}</ul></details><hr>"
else:
reviewed_files += "<p>No issues found with the reviewed changes</p></details><hr>"
potential_issues_string = render_code_review_issues(pr, code_review, issue_type="potential")
reviewed_files += f"<details><summary><strong>Potential Issues</strong></summary><p>Sweep isn't 100% sure if the following are issues or not but they may be worth taking a look at.</p><ul>{format_code_sections(potential_issues_string)}</ul></details>"
reviewed_files += "</details><hr>"
if len(dropped_files) == 1:
reviewed_files += f"<p>{dropped_files[0]} was not reviewed because our filter identified it as typically a non-human-readable or less important file (e.g., dist files, package.json, images). If this is an error, please let us know.</p>"
elif len(dropped_files) > 1:
dropped_files_string = "".join([f"<li>{file}</li>" for file in dropped_files])
reviewed_files += f"<p>The following files were not reviewed because our filter identified them as typically non-human-readable or less important files (e.g., dist files, package.json, images). If this is an error, please let us know.</p><ul>{dropped_files_string}</ul>"
return body + reviewed_files

# handles the creation or update of the Sweep comment letting the user know that Sweep is reviewing a pr
# returns the comment_id
def create_update_review_pr_comment(pr: PullRequest, code_review_by_file: dict[str, CodeReview] | None = None) -> int:
def create_update_review_pr_comment(pr: PullRequest, code_review_by_file: dict[str, CodeReview] | None = None, dropped_files: list[str] = []) -> int:
comment_id = -1
sweep_comment = None
# comments that appear in the github ui in the conversation tab are considered issue comments
Expand All @@ -706,7 +718,7 @@ def create_update_review_pr_comment(pr: PullRequest, code_review_by_file: dict[s

# update body of sweep_comment
if code_review_by_file:
rendered_pr_review = render_pr_review_by_file(pr, code_review_by_file)
rendered_pr_review = render_pr_review_by_file(pr, code_review_by_file, dropped_files=dropped_files)
sweep_comment.edit(rendered_pr_review)
comment_id = sweep_comment.id
return comment_id
Expand Down
2 changes: 1 addition & 1 deletion tests/test_code_review.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def temp_pr_changes(url):
repo = g.get_repo(repo_name)
pr = repo.get_pull(pr_number)
# Fetch the diff
pr_changes = get_pr_changes(repo, pr)
pr_changes, _ = get_pr_changes(repo, pr)
return pr_changes

pr_changes = temp_pr_changes(url)
Expand Down

0 comments on commit 5b8b9fb

Please sign in to comment.