potential issues dropped issues (#3775)

update how comment renders includes comments about dropped files and potential issues --------- Co-authored-by: Martin Ye <martin@sweep.dev>
sweepai · May 17, 2024 · 5b8b9fb · 5b8b9fb
1 parent 78f1e08
commit 5b8b9fb
Show file tree

Hide file tree

Showing 5 changed files with 58 additions and 34 deletions.
diff --git a/sweepai/core/review_utils.py b/sweepai/core/review_utils.py
@@ -56,6 +56,7 @@ def get_pr_changes(repo: Repository, pr: PullRequest) -> list[PRChange]:
     file_diffs = comparison.files
 
     pr_diffs = []
+    dropped_files = []
     for file in tqdm(file_diffs, desc="Annotating diffs"):
         file_name = file.filename
         diff = file.patch
@@ -64,6 +65,7 @@ def get_pr_changes(repo: Repository, pr: PullRequest) -> list[PRChange]:
 
         # drop excluded files: for example package-lock.json files
         if sweep_config.is_file_excluded(file_name):
+            dropped_files.append(file_name)
             continue
 
         if file.status == "added":
@@ -95,7 +97,7 @@ def get_pr_changes(repo: Repository, pr: PullRequest) -> list[PRChange]:
         pr_diffs.append(
             pr_change
         )
-    return pr_diffs
+    return pr_diffs, dropped_files
 
 def split_diff_into_patches(diff: str) -> list[Patch]:
     patches = []
@@ -319,7 +321,7 @@ def review_code_changes_by_file(self, pr_changes_by_file: dict[str, str], chat_l
             if issues_matches:
                 issues = "\n".join([match.strip() for match in issues_matches])
             potential_issues = parse_issues_from_code_review(issues)
-            code_reviews_by_file[file_name] = CodeReview(file_name=file_name, diff_summary=diff_summary, issues=potential_issues)
+            code_reviews_by_file[file_name] = CodeReview(file_name=file_name, diff_summary=diff_summary, issues=potential_issues, potential_issues=[])
             if chat_logger:
                 chat_logger.add_chat(
                     {

diff --git a/sweepai/dataclasses/codereview.py b/sweepai/dataclasses/codereview.py
@@ -12,6 +12,7 @@ class CodeReview:
     file_name: str
     diff_summary: str
     issues: list[CodeReviewIssue]
+    potential_issues: list[CodeReviewIssue]
 
 @dataclass
 class Patch:

diff --git a/sweepai/handlers/review_pr.py b/sweepai/handlers/review_pr.py
@@ -1,3 +1,4 @@
+import copy
 import multiprocessing
 import os
 from time import time
@@ -88,17 +89,20 @@ def group_vote_review_pr(pr_changes: list[PRChange], formatted_pr_changes_by_fil
         all_flattened_embeddings = np.array(all_embeddings)
         # note DBSCAN expects a shape with less than or equal to 2 dimensions
         try:
-            db = DBSCAN(eps=0.5, min_samples=3).fit(all_flattened_embeddings)
+            if all_flattened_embeddings.size:
+                db = DBSCAN(eps=0.5, min_samples=3).fit(all_flattened_embeddings)
+                files_to_labels[file_name] = db.labels_
+            else:
+                files_to_labels[file_name] = []
         except ValueError as e:
             logger.error(f"Error with dbscan {e}")
-        files_to_labels[file_name] = db.labels_
-
+
     LABEL_THRESHOLD = 4
     # get the labels that have a count greater than the threshold
     # format: {file_name: {label: [index, ...]}}
     files_to_labels_indexes = {}
     for file_name, labels in files_to_labels.items():
-        index_dict = {}
+        index_dict: dict[str, list[int]] = {}
         for i, v in enumerate(labels):
             key = str(v)
             if key not in index_dict:
@@ -109,18 +113,23 @@ def group_vote_review_pr(pr_changes: list[PRChange], formatted_pr_changes_by_fil
     # create the final code_reviews_by_file
     for file_name, labels_dict in files_to_labels_indexes.items():
         # pick first one as diff summary doesnt really matter
-        final_code_review: CodeReview = code_reviews_by_file[0][file_name]
+        final_code_review: CodeReview = copy.deepcopy(code_reviews_by_file[0][file_name])
         final_code_review.issues = []
+        final_code_review.potential_issues = []
+        final_issues = []
+        potential_issues = []
         for label, indexes in labels_dict.items():
-            final_issues = []
             # -1 is considered as noise
             if len(indexes) >= LABEL_THRESHOLD and label != "-1":
                 # add to final issues, first issue - TODO use similarity score of all issues against each other
                 final_issues.append(files_to_issues[file_name][indexes[0]])
-            final_code_review.issues = final_issues
-
-        majority_code_review_by_file[file_name] = final_code_review
-
+            # get potential issues which are one below the label_threshold
+            if len(indexes) == LABEL_THRESHOLD - 1 and label != "-1":
+                # add to final issues, first issue - TODO use similarity score of all issues against each other
+                potential_issues.append(files_to_issues[file_name][indexes[0]])
+        final_code_review.issues = final_issues
+        final_code_review.potential_issues = potential_issues
+        majority_code_review_by_file[file_name] = copy.deepcopy(final_code_review)
     return majority_code_review_by_file
 
 def review_pr(username: str, pr: PullRequest, repository: Repository, installation_id: int, tracking_id: str | None = None):
@@ -158,10 +167,10 @@ def review_pr(username: str, pr: PullRequest, repository: Repository, installati
                 return {"success": False, "reason": "PR is closed"}
             # handle creating comments on the pr to tell the user we are going to begin reviewing the pr
             _comment_id = create_update_review_pr_comment(pr)
-            pr_changes = get_pr_changes(repository, pr)
+            pr_changes, dropped_files = get_pr_changes(repository, pr)
             formatted_pr_changes_by_file = format_pr_changes_by_file(pr_changes)
             code_review_by_file = group_vote_review_pr(pr_changes, formatted_pr_changes_by_file, multiprocess=True, chat_logger=chat_logger)
-            _comment_id = create_update_review_pr_comment(pr, code_review_by_file=code_review_by_file)
+            _comment_id = create_update_review_pr_comment(pr, code_review_by_file=code_review_by_file, dropped_files=dropped_files)
         except Exception as e:
             posthog.capture(
                 username,

diff --git a/sweepai/utils/ticket_rendering_utils.py b/sweepai/utils/ticket_rendering_utils.py
@@ -615,23 +615,27 @@ def parse_issues_from_code_review(issue_string: str):
     return potential_issues
 
 # converts the list of issues inside a code_review into markdown text to display in a github comment
-def render_code_review_issues(pr: PullRequest, code_review: CodeReview):
+def render_code_review_issues(pr: PullRequest, code_review: CodeReview, issue_type: str = ""):
     files_to_blobs = {file.filename: file.blob_url for file in list(pr.get_files())}
     # generate the diff urls
     files_to_diffs = {}
     for file_name, _ in files_to_blobs.items():
         sha_256 = hashlib.sha256(file_name.encode('utf-8')).hexdigest()
         files_to_diffs[file_name] = f"{pr.html_url}/files#diff-{sha_256}"
-    potential_issues = ""
-    for issue in code_review.issues:
-        if issue.start_line == issue.end_line:
-            issue_blob_url = f"{files_to_blobs[code_review.file_name]}#L{issue.start_line}"
-            issue_diff_url = f"{files_to_diffs[code_review.file_name]}R{issue.start_line}"
-        else:
-            issue_blob_url = f"{files_to_blobs[code_review.file_name]}#L{issue.start_line}-L{issue.end_line}"
-            issue_diff_url = f"{files_to_diffs[code_review.file_name]}R{issue.start_line}-R{issue.end_line}"
-        potential_issues += f"<li>{issue.issue_description}</li>\n\n{issue_blob_url}\n[View Diff]({issue_diff_url})"
-    return potential_issues
+    code_issues = code_review.issues
+    if issue_type == "potential":
+        code_issues = code_review.potential_issues
+    code_issues_string = ""
+    for issue in code_issues:
+        if code_review.file_name in files_to_blobs:
+            if issue.start_line == issue.end_line:
+                issue_blob_url = f"{files_to_blobs[code_review.file_name]}#L{issue.start_line}"
+                issue_diff_url = f"{files_to_diffs[code_review.file_name]}R{issue.start_line}"
+            else:
+                issue_blob_url = f"{files_to_blobs[code_review.file_name]}#L{issue.start_line}-L{issue.end_line}"
+                issue_diff_url = f"{files_to_diffs[code_review.file_name]}R{issue.start_line}-R{issue.end_line}"
+            code_issues_string += f"<li>{issue.issue_description}</li>\n\n{issue_blob_url}\n[View Diff]({issue_diff_url})"
+    return code_issues_string
 
 def escape_html(text: str) -> str:
     return text.replace('<', '&lt;').replace('>', '&gt;')
@@ -670,24 +674,32 @@ def format_code_sections(text: str) -> str:
     return '<code>'.join(parts)
 
 # turns code_review_by_file into markdown string
-def render_pr_review_by_file(pr: PullRequest, code_review_by_file: dict[str, CodeReview]) -> str:
+def render_pr_review_by_file(pr: PullRequest, code_review_by_file: dict[str, CodeReview], dropped_files: list[str] = []) -> str:
     body = f"{SWEEP_PR_REVIEW_HEADER}\n"
     reviewed_files = ""
     for file_name, code_review in code_review_by_file.items():
-        potential_issues = code_review.issues
+        sweep_issues = code_review.issues
+        potential_issues = code_review.potential_issues
         reviewed_files += f"""<details open>
 <summary>{file_name}</summary>
 <p>{format_code_sections(code_review.diff_summary)}</p>"""
+        if sweep_issues:
+            sweep_issues_string = render_code_review_issues(pr, code_review)
+            reviewed_files += f"<p><strong>Sweep Found These Issues</strong></p><ul>{format_code_sections(sweep_issues_string)}</ul>"
         if potential_issues:
-            potential_issues_string = render_code_review_issues(pr, code_review)
-            reviewed_files += f"<p><strong>Sweep Found These Issues</strong></p><ul>{format_code_sections(potential_issues_string)}</ul></details><hr>"
-        else:
-            reviewed_files += "<p>No issues found with the reviewed changes</p></details><hr>"
+            potential_issues_string = render_code_review_issues(pr, code_review, issue_type="potential")
+            reviewed_files += f"<details><summary><strong>Potential Issues</strong></summary><p>Sweep isn't 100% sure if the following are issues or not but they may be worth taking a look at.</p><ul>{format_code_sections(potential_issues_string)}</ul></details>"
+        reviewed_files += "</details><hr>"
+    if len(dropped_files) == 1:
+        reviewed_files += f"<p>{dropped_files[0]} was not reviewed because our filter identified it as typically a non-human-readable or less important file (e.g., dist files, package.json, images). If this is an error, please let us know.</p>"
+    elif len(dropped_files) > 1:
+        dropped_files_string = "".join([f"<li>{file}</li>" for file in dropped_files])
+        reviewed_files += f"<p>The following files were not reviewed because our filter identified them as typically non-human-readable or less important files (e.g., dist files, package.json, images). If this is an error, please let us know.</p><ul>{dropped_files_string}</ul>"
     return body + reviewed_files
 
 # handles the creation or update of the Sweep comment letting the user know that Sweep is reviewing a pr
 # returns the comment_id
-def create_update_review_pr_comment(pr: PullRequest, code_review_by_file: dict[str, CodeReview] | None = None) -> int:
+def create_update_review_pr_comment(pr: PullRequest, code_review_by_file: dict[str, CodeReview] | None = None, dropped_files: list[str] = []) -> int:
     comment_id = -1
     sweep_comment = None
     # comments that appear in the github ui in the conversation tab are considered issue comments
@@ -706,7 +718,7 @@ def create_update_review_pr_comment(pr: PullRequest, code_review_by_file: dict[s
 
     # update body of sweep_comment
     if code_review_by_file:
-        rendered_pr_review = render_pr_review_by_file(pr, code_review_by_file)
+        rendered_pr_review = render_pr_review_by_file(pr, code_review_by_file, dropped_files=dropped_files)
         sweep_comment.edit(rendered_pr_review)
     comment_id = sweep_comment.id
     return comment_id

diff --git a/tests/test_code_review.py b/tests/test_code_review.py
@@ -25,7 +25,7 @@ def temp_pr_changes(url):
     repo = g.get_repo(repo_name)
     pr = repo.get_pull(pr_number)
     # Fetch the diff
-    pr_changes = get_pr_changes(repo, pr)
+    pr_changes, _ = get_pr_changes(repo, pr)
     return pr_changes
 
 pr_changes = temp_pr_changes(url)