Feat/sweep review improvements 0618 (#4053)

improvements to review --------- Co-authored-by: wwzeng1 <william@sweep.dev> Co-authored-by: Martin Ye <martin@sweep.dev> Co-authored-by: MartinYe1234 <52641447+MartinYe1234@users.noreply.github.com>
sweepai · Jun 24, 2024 · 923fd9c · 923fd9c
1 parent a6d27f5
commit 923fd9c
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 15 deletions.
diff --git a/sweepai/config/client.py b/sweepai/config/client.py
@@ -332,7 +332,6 @@ def is_file_bad(self, file_name: str, repo_dir: str) -> tuple[bool, str]:
         """
         Uses github-linguist to determine if a file is "good" or not
         """
-        vendored = False
         generated = False
         try:
             query = ["github-linguist", file_name, "-j"]
@@ -345,12 +344,9 @@ def is_file_bad(self, file_name: str, repo_dir: str) -> tuple[bool, str]:
             )
             result = json.loads(response.stdout)
             type = result[file_name]["type"]
-            vendored = result[file_name]['vendored']
             generated = result[file_name]['generated']
             language = result[file_name]["language"]
             # if there is a string of numbers in the file name that is more than 4 characters long, it is likely autogenerated
-            if vendored:
-                return True, "This file is likely a vendored file."
             if generated:
                 return True, "This file is likely an autogenerated file."
             if type != "Text":

diff --git a/sweepai/core/review_prompts.py b/sweepai/core/review_prompts.py
@@ -10,6 +10,14 @@
 It is your job to make sure that the pull request changes do not violate any of the given rules.
 """
 
+system_prompt_edge_case_question_creation = """You are a careful and smart tech lead that wants to avoid production issues. You will be analyzing a set of diffs representing a pull request made to a piece of source code.
+It is your job to ask a series of detailed and exhaustive set of questions which will then be passed on to the original engineer. The point of this is to make sure that after all the questions have been answered, it will be clear whether or not there are any production issues with the pull request. You are to tailor the questions you create to the code changes provided.
+"""
+
+system_prompt_edge_case_answer = """You are a careful and smart tech lead that wants to avoid production issues. You will be analyzing a set of diffs representing a pull request made to a piece of source code.
+It is your job to answer a series of questions presented to you by a previous engineer. The questions are meant to help identify any potential production issues with the pull request. You are to answer these questions and using those answers determine if there are any production issues with the pull request.
+"""
+
 system_prompt_identify_new_functions = """You are an expert programmer with a keen eye for detail, assigned to analyze a series of code patches in a pull request. Your primary responsibility is to meticulously identify all newly created functions within the code."""
 
 system_prompt_identify_repeats = """You are a proficient programmer tasked with identifying useless utility functions in a codebase. Your job is to identify any useless function definitions.
@@ -66,6 +74,7 @@
     2d. Do not make assumptions about existing functions or code. Assume all existing code and system configurations are correct and functioning as intended.
     2e. Do not raise issues over functions being called without error handling. You do not have enough context to determine if the functions are being called correctly.
     2f. You must take into account the intentions of the pull request when identifying issues. Are the the code changes in line with the intentions of the pull request? If the answer is yes then this is not an issue.
+    2g. Do not raise issues related to system configuration/resources. For example, do not mention anything about potential resource exhaustion.
 
 Answer each of the above questions in step 2 in the following format:
 <issue_identification>
@@ -108,6 +117,82 @@
 </rules_analysis>
 """
 
+user_prompt_edge_case_question_creation_format = """
+# Code Review
+Here are the changes in the pull request changes given in diff format:
+<changes>
+{diff}
+</changes>
+
+# Instructions
+1. Summarize the code changes
+Review each change individually, examining the code changes line-by-line. Then review all the changes at once in order to get an idea of how each individual change combines together and what the overall effect of the pull request is.
+
+Respond in the format below, this is mandatory:
+<thoughts>
+{{Provide your thoughts on the code changes above, providing a summary and how each change ties together with the other changes.}}
+</thoughts>
+
+2. Create the questions
+You will be given following Cases in which you much generate a series of questions. For each case you will be given guidelines on how to create these questions most effectively. You are to tailor the questions you create to the code changes provided. Make sure to explicitly reference variable names and entities related to the questions and even line numbers if necessary. You MUST create the questions in a way such that an affirmative answer "yes" means that there is an issue with the pull request. That means you should phrase you questions in a way such that they are asking if this issue exists or not. ALWAYS begin the question by asking "Yes or No?"
+<cases>
+<case_1>
+Case 1 - Create questions related to concurrency changes. If there are changes related to concurrency in the pull request you are to generate questions related to concurrency changes.  If not, you may skip this part and go to the next Case. The questions you generate should be an exhaustive list of questions meant to help identify issues with the code changes.
+<guidelines_for_case_1>
+1. Create questions asking if the variables are mutated unintentionally in any way.
+2. When there are variables such as arrays where the order of the data matters, ask questions about whether or not the order of these variables are mutated.
+3. Be strict with how you ask your questions. Do not use words like "potentially" or "possibly". If you are asking if an issue exists, ask in a direct manner. Example: Instead of asking "Is there any chance that there is a deadlock" instead ask "Is there a deadlock".
+4. Phrase the questions to make sure that if the answer is "Yes" then that means there is an issue with the pull request. Basically you are asking if an issue exists or not.
+</guidelines_for_case_1>
+</case_1>
+</cases>
+
+Respond with ALL your created questions for every case in the following format below, this is mandatory:
+<created_questions>
+1. Yes or No? Question 1 goes here...
+2. Yes or No? Question 2 goes here...
+...
+n. Yes or No? Question n goes here...
+</created_questions>
+"""
+
+user_prompt_edge_case_question_answer_format = """
+# Code Review
+Here are the changes in the pull request changes given in diff format:
+<changes>
+{diff}
+</changes>
+
+# Questions to Answer
+<questions_to_answer>
+{questions}
+</questions_to_answer>
+
+# Instructions
+1. Summarize the code changes
+Review each change individually, examining the code changes line-by-line. Then review all the changes at once in order to get an idea of how each individual change combines together and what the overall effect of the pull request is.
+
+Respond in the format below, this is mandatory:
+<thoughts>
+{{Provide your thoughts on the code changes above, providing a summary and how each change ties together with the other changes.}}
+</thoughts>
+
+2. Answer the # Questions to Answer
+Now you must answer each question, justifying each answer as you go and giving an example from the code changes to support your answer.
+First repeat the question verbatim then answer the question in the following format below, this is mandatory:
+<questions_and_answers>
+1. Question 1 repeated verbatim. Answer: Question 1 answer goes here...
+2. Question 2 repeated verbatim. Answer: Question 2 answer goes here...
+...
+n. Question n repeated verbatim. Answer: Question n answer goes here...
+</questions_and_answers>
+
+3. Identify Issues
+Based on your answers to the questions above you are now to identify any potential issues that the code changes may introduce. 
+Read through each question and answer pair and determine if an issue needs to be raised. 
+If there was not enough information to answer a question IT IS NOT AN ISSUE! DO NOT INCLUDE IT AS AN ISSUE AND MOVE ON TO THE NEXT QUESTION. 
+"""
+
 user_prompt_issue_output_format = """
 [FORMAT]
 Finally, format the found issues and root causes using the following XML tags. Each issue description should be a single sentence. 
@@ -170,14 +255,17 @@
 
 # Instructions
 1. Analyze each identified potential issue for the file(s) {file_names}
-    1a. Review each identified issue individually, formulate 3-5 questions to answer in order to determine the severity of the issue.
-    1b. Answer the questions formulated in step 1a. In order to accomplish this examine the referenced lines of code in the provided code files above.
-    1c. Answer the following questions in addition to the ones you generated in steps 1a. Is this reported issue accurate (double check that the previous reviewer was not mistaken, YOU MUST include the corresponding patch for proof)? If the answer to this question is no, then the issue is not severe. 
-    1d. Determine whether or not this issue is severe enough to prevent the pull request from being merged or not. For example, any potential logical error is considered severe.
-    1e. Take note of some common issues: Accidently removing or commenting out lines of code that has functional utility. In this case double check if this change was intentional or accidental.
+Review each identified issue individually, formulate 3 questions to answer in order to determine the severity of the issue.
+    1a. First formulated question and answer. In order to accomplish this examine the referenced lines of code in the provided code files above.
+    1b. Second formulated question and answer. In order to accomplish this examine the referenced lines of code in the provided code files above.
+    1c. Third formulated question and answer. In order to accomplish this examine the referenced lines of code in the provided code files above.
+    1d. Is this reported issue accurate (double check that the previous reviewer was not mistaken, YOU MUST include the corresponding patch for proof). If the answer to this question is no, then the issue is not severe. 
+    1e. Is there accidentally removed or commented out lines of code that has functional utility. In this case double check if this change was intentional or accidental.
     1f. Take into account the intentions of the pull request when identifying issues. Are the code changes in line with the intentions of the pull request? If the answer is yes then this is not an issue.
-    1g. Is this issue related to potential secuirty vulnerabilities? If yes, then this issue is NOT severe and should not be included.
-    1h. Finally was this issue already raised in a comment thread? If yes, then this issue has already been identified and you should not raise it again. You must also provide proof by referencing the exact comment where this issue was raised.
+    1g. Is this issue related to potential security vulnerabilities? If yes, then this issue is NOT severe and should not be included.
+    1h. Does this issue have the potential to cause a production crash or introduce a logical error? If yes, then this issue is severe and should be included in the final list of issues.
+    1i. Finally was this issue already raised in a comment thread? If yes, then this issue has already been identified and you should not raise it again. You must also provide proof by referencing the exact comment where this issue was raised.
+    1j. Is this issue severe enough based on the questions and answers above to prevent the pull request from being merged? Any issue that will cause production to crash or introduce a logical error is considered severe.
 """
 user_prompt_review_special_rules = """
 In addition to all the above questions you must answer in step 1, the following rules also apply to this file as defined by the user themselves:
@@ -200,14 +288,14 @@
 1a. Answer for question 1a...
 1b. ...
 ...
-1g. Answer for question 1g...}}
+1j. Answer for question 1j...}}
 </thinking>
 ...
 </thoughts>"""
 
 user_prompt_review_decisions = """
 2. Decide which issues to keep
-    2a. Based on your analysis in step 1, now decide which issues to keep and drop. Only include severe issues.
+    2a. Based on your analysis in step 1, now decide which issues to keep and drop. Only include severe issues. To determine if an issue is severe examine the questions and answers you provided above.
     2b. After choosing to keep an issue you are to respond in the following format:
 <severe_issues>
 <issue>

diff --git a/sweepai/core/review_utils.py b/sweepai/core/review_utils.py
@@ -17,6 +17,8 @@
 from sweepai.core.vector_db import cosine_similarity, embed_text_array
 from sweepai.core.review_prompts import (
     system_prompt,
+    system_prompt_edge_case_question_creation,
+    system_prompt_edge_case_answer,
     system_prompt_pr_summary,
     system_prompt_review,
     system_prompt_special_rules,
@@ -31,6 +33,8 @@
     user_prompt_review_decisions,
     user_prompt_identify_new_functions,
     user_prompt_identify_repeats,
+    user_prompt_edge_case_question_creation_format,
+    user_prompt_edge_case_question_answer_format,
     patch_format,
     patch_format_without_annotations,
     patches_for_file_format,
@@ -392,7 +396,7 @@ def format_all_pr_changes_by_groups(
                 formatted_pr_changes_by_groups[group.group_name] = group
     return formatted_pr_changes_by_groups
 
-CLAUDE_MODEL = "claude-3-opus-20240229"
+CLAUDE_MODEL = "claude-3-5-sonnet-20240620"
 
 class PRReviewBot(ChatGPT):
     # check if there are special .md rules to see if there are additional rules to check for
@@ -533,6 +537,52 @@ def review_code_changes_by_file(
                             seed=seed,
                         )
                         code_review_response += special_rules_response
+            # add a final check for edge cases
+            self.messages = [
+                Message(
+                    role="system",
+                    content=system_prompt_edge_case_question_creation,
+                )
+            ]
+
+            formatted_user_prompt_edge_case_question_creation = user_prompt_edge_case_question_creation_format.format(
+                diff=rendered_changes
+            )
+
+            edge_case_questions_response = self.chat_anthropic(
+                content=formatted_user_prompt_edge_case_question_creation,
+                temperature=0.1,
+                model=CLAUDE_MODEL,
+                use_openai=True,
+                seed=seed
+            )
+
+            created_questions_dict, failed, _ = extract_object_fields_from_string(
+                edge_case_questions_response, ["created_questions"]
+            )
+            if not failed:
+                created_questions = created_questions_dict["created_questions"]
+                # now get llm to answer the questions it created
+                self.messages = [
+                    Message(
+                        role="system",
+                        content=system_prompt_edge_case_answer,
+                    )
+                ]
+                formatted_user_prompt_edge_case_answer = user_prompt_edge_case_question_answer_format.format(
+                    diff=rendered_changes,
+                    questions=created_questions
+                )
+                formatted_user_prompt_edge_case_answer += user_prompt_issue_output_format
+                edge_case_answer_response = self.chat_anthropic(
+                    content=formatted_user_prompt_edge_case_answer,
+                    temperature=0.1,
+                    model=CLAUDE_MODEL,
+                    use_openai=True,
+                    seed=seed
+                )
+                code_review_response += edge_case_answer_response
+
             diff_summary = ""
             diff_summary_pattern = r"<diff_summary>(.*?)</diff_summary>"
             diff_summary_matches = re.findall(diff_summary_pattern, code_review_response, re.DOTALL)
@@ -1076,7 +1126,7 @@ def group_vote_review_pr(
         # note DBSCAN expects a shape with less than or equal to 2 dimensions
         try:
             if all_flattened_embeddings.size:
-                db = DBSCAN(eps=0.253, min_samples=2).fit(all_flattened_embeddings)
+                db = DBSCAN(eps=0.375, min_samples=2).fit(all_flattened_embeddings)
                 groups_to_labels[group_name] = db.labels_
             else:
                 groups_to_labels[group_name] = []