From c5a89c04957af49d0a99edbcd059861d6a632229 Mon Sep 17 00:00:00 2001 From: Rangeet Pan Date: Fri, 6 Sep 2024 10:43:52 -0400 Subject: [PATCH 1/5] support for adding incremental analysis --- .../java/codeanalyzer/codeanalyzer.py | 33 ++++++++++++---- cldk/analysis/java/java.py | 33 +++++++++------- cldk/core.py | 39 ++++++++++++------- cldk/models/java/models.py | 1 + 4 files changed, 72 insertions(+), 34 deletions(-) diff --git a/cldk/analysis/java/codeanalyzer/codeanalyzer.py b/cldk/analysis/java/codeanalyzer/codeanalyzer.py index eff3fa8..4a6bb2f 100644 --- a/cldk/analysis/java/codeanalyzer/codeanalyzer.py +++ b/cldk/analysis/java/codeanalyzer/codeanalyzer.py @@ -64,6 +64,7 @@ def __init__( analysis_level: str, use_graalvm_binary: bool, eager_analysis: bool, + target_files: List[str] | None ) -> None: self.project_dir = project_dir self.source_code = source_code @@ -72,6 +73,7 @@ def __init__( self.use_graalvm_binary = use_graalvm_binary self.eager_analysis = eager_analysis self.analysis_level = analysis_level + self.target_files = target_files self.application = self._init_codeanalyzer( analysis_level=1 if analysis_level == "symbol_table" else 2 ) @@ -230,9 +232,17 @@ def _init_codeanalyzer(self, analysis_level=1) -> JApplication: if self.analysis_json_path is None: logger.info("Reading analysis from the pipe.") - codeanalyzer_args = codeanalyzer_exec + shlex.split( - f"-i {Path(self.project_dir)} --analysis-level={analysis_level}" - ) + codeanalyzer_args = '' + # If target file is provided, the input is merged into a single string and passed to codeanalyzer + if self.target_files: + target_file_options = ' '.join([s.strip() for s in self.target_files]) + codeanalyzer_args = codeanalyzer_exec + shlex.split( + f"-i {Path(self.project_dir)} --analysis-level={analysis_level} --target-files={target_file_options}" + ) + else: + codeanalyzer_args = codeanalyzer_exec + shlex.split( + f"-i {Path(self.project_dir)} --analysis-level={analysis_level}" + ) try: 
logger.info(f"Running codeanalyzer: {' '.join(codeanalyzer_args)}") console_out: CompletedProcess[str] = subprocess.run( @@ -254,10 +264,18 @@ def _init_codeanalyzer(self, analysis_level=1) -> JApplication: # flag is set, we'll run the analysis every time the object is created. This will happen regradless # of the existence of the analysis file. # Create the executable command for codeanalyzer. - codeanalyzer_args = codeanalyzer_exec + shlex.split( - f"-i {Path(self.project_dir)} --analysis-level={analysis_level} -o {self.analysis_json_path}" - ) - + codeanalyzer_args = '' + # If target file is provided, the input is merged into a single string and passed to codeanalyzer + if self.target_files: + target_file_options = ' '.join([s.strip() for s in self.target_files]) + codeanalyzer_args = codeanalyzer_exec + shlex.split( + f"-i {Path(self.project_dir)} --analysis-level={analysis_level}" + f" -o {self.analysis_json_path} --target-files={target_file_options}" + ) + else: + codeanalyzer_args = codeanalyzer_exec + shlex.split( + f"-i {Path(self.project_dir)} --analysis-level={analysis_level} -o {self.analysis_json_path}" + ) try: logger.info( f"Running codeanalyzer subprocess with args {codeanalyzer_args}" @@ -289,7 +307,6 @@ def _codeanalyzer_single_file(self): JApplication The application view of the Java code with the analysis results. 
""" - # self.source_code: str = re.sub(r"[\r\n\t\f\v]+", lambda x: " " if x.group() in "\t\f\v" else " ", self.source_code) codeanalyzer_exec = self._get_codeanalyzer_exec() codeanalyzer_args = ["--source-analysis", self.source_code] codeanalyzer_cmd = codeanalyzer_exec + codeanalyzer_args diff --git a/cldk/analysis/java/java.py b/cldk/analysis/java/java.py index 0b02684..5d29564 100644 --- a/cldk/analysis/java/java.py +++ b/cldk/analysis/java/java.py @@ -15,15 +15,16 @@ class JavaAnalysis(SymbolTable, CallGraph): def __init__( - self, - project_dir: str | Path | None, - source_code: str | None, - analysis_backend: str, - analysis_backend_path: str | None, - analysis_json_path: str | Path | None, - analysis_level: str, - use_graalvm_binary: bool, - eager_analysis: bool, + self, + project_dir: str | Path | None, + source_code: str | None, + analysis_backend: str, + analysis_backend_path: str | None, + analysis_json_path: str | Path | None, + analysis_level: str, + target_files: List[str] | None, + use_graalvm_binary: bool, + eager_analysis: bool, ) -> None: """ Parameters @@ -44,7 +45,9 @@ def __init__( eager_analysis : bool, optional A flag indicating whether to perform eager analysis, defaults to False. If True, the analysis is performed eagerly. That is, the analysis.json file is created during analysis every time even if it already exists. - + target_files: List[str] | None, optional + The target files for which the analysis will run or get modified. Currently, this feature is only supported + with symbol table analysis. In the future, we will add this feature to other analysis levels. 
Attributes ---------- analysis_backend : JCodeQL | JApplication @@ -59,7 +62,8 @@ def __init__( self.analysis_backend_path = analysis_backend_path self.eager_analysis = eager_analysis self.use_graalvm_binary = use_graalvm_binary - self.analysis_backend = analysis_backend + self.analysis_backend = analysis_backend + self.target_files = target_files # Initialize the analysis analysis_backend if analysis_backend.lower() == "codeql": self.analysis_backend: JCodeQL = JCodeQL(self.project_dir, self.analysis_json_path) @@ -72,6 +76,7 @@ def __init__( analysis_json_path=self.analysis_json_path, use_graalvm_binary=self.use_graalvm_binary, analysis_backend_path=self.analysis_backend_path, + target_files=self.target_files ) else: raise NotImplementedError(f"Support for {analysis_backend} has not been implemented yet.") @@ -438,7 +443,9 @@ def get_implemented_interfaces(self, qualified_class_name) -> List[str]: raise NotImplementedError(f"Support for this functionality has not been implemented yet.") return self.backend.get_implemented_interfaces(qualified_class_name) - def __get_class_call_graph_using_symbol_table(self, qualified_class_name: str, method_signature: str | None = None) -> (List)[Tuple[JMethodDetail, JMethodDetail]]: + def __get_class_call_graph_using_symbol_table(self, qualified_class_name: str, + method_signature: str | None = None) -> (List)[ + Tuple[JMethodDetail, JMethodDetail]]: """ A call graph using symbol table for a given class and a given method. 
Args: @@ -478,7 +485,7 @@ def get_class_call_graph(self, qualified_class_name: str, method_signature: str """ if using_symbol_table: return self.__get_class_call_graph_using_symbol_table(qualified_class_name=qualified_class_name, - method_signature=method_signature) + method_signature=method_signature) if self.analysis_backend in [AnalysisEngine.CODEQL, AnalysisEngine.TREESITTER]: raise NotImplementedError(f"Support for this functionality has not been implemented yet.") return self.backend.get_class_call_graph(qualified_class_name, method_signature) diff --git a/cldk/core.py b/cldk/core.py index ec6f07b..91445c9 100644 --- a/cldk/core.py +++ b/cldk/core.py @@ -1,8 +1,9 @@ from pathlib import Path - import logging +from typing import List +from cldk.analysis import AnalysisLevel from cldk.analysis.java import JavaAnalysis from cldk.analysis.java.treesitter import JavaSitter from cldk.utils.exceptions import CldkInitializationException @@ -30,15 +31,16 @@ def __init__(self, language: str): self.language: str = language def analysis( - self, - project_path: str | Path | None = None, - source_code: str | None = None, - eager: bool = False, - analysis_backend: str | None = "codeanalyzer", - analysis_level: str = "symbol_table", - analysis_backend_path: str | None = None, - analysis_json_path: str | Path = None, - use_graalvm_binary: bool = False, + self, + project_path: str | Path | None = None, + source_code: str | None = None, + eager: bool = False, + analysis_backend: str | None = "codeanalyzer", + analysis_level: str = AnalysisLevel.symbol_table, + target_files: List[str] | None = None, + analysis_backend_path: str | None = None, + analysis_json_path: str | Path = None, + use_graalvm_binary: bool = False, ) -> JavaAnalysis: """ Initialize the preprocessor based on the specified language and analysis_backend. @@ -65,7 +67,11 @@ def analysis( eager : bool, optional A flag indicating whether to perform eager analysis, defaults to False. 
If True, the analysis is performed eagerly. That is, the analysis.json file is created during analysis every time even if it already exists. - + analysis_level: str, optional + Analysis levels. Refer to AnalysisLevel. + target_files: List[str] | None, optional + The target files (paths) for which the analysis will run or get modified. Currently, this feature is only supported + with symbol table analysis. In the future, we will add this feature to other analysis levels. Returns ------- JavaAnalysis @@ -77,13 +83,19 @@ def analysis( If neither project_path nor source_code is provided. NotImplementedError If the specified language is not implemented yet. + + Args: + analysis_level: + target_files: + analysis_level: """ if project_path is None and source_code is None: raise CldkInitializationException("Either project_path or source_code must be provided.") if project_path is not None and source_code is not None: - raise CldkInitializationException("Both project_path and source_code are provided. Please provide " "only one.") + raise CldkInitializationException( + "Both project_path and source_code are provided. Please provide " "only one.") if self.language == "java": return JavaAnalysis( @@ -94,6 +106,7 @@ def analysis( analysis_backend_path=analysis_backend_path, analysis_json_path=analysis_json_path, use_graalvm_binary=use_graalvm_binary, + target_files=target_files, eager_analysis=eager, ) else: @@ -114,7 +127,7 @@ def treesitter_parser(self): else: raise NotImplementedError(f"Treesitter parser for {self.language} is not implemented yet.") - def tree_sitter_utils(self, source_code: str) -> [TreesitterSanitizer| NotImplementedError]: + def tree_sitter_utils(self, source_code: str) -> [TreesitterSanitizer | NotImplementedError]: """ Parse the project using treesitter. 
diff --git a/cldk/models/java/models.py b/cldk/models/java/models.py index 25713e2..0743502 100644 --- a/cldk/models/java/models.py +++ b/cldk/models/java/models.py @@ -291,6 +291,7 @@ class JType(BaseModel): is_record_declaration: bool = False is_concrete_class: bool = False is_entry_point: bool = False + is_modified: bool = False comment: str extends_list: List[str] = [] implements_list: List[str] = [] From 399199c3cb6db56bfd737f4bd5c0a32ecc3766c0 Mon Sep 17 00:00:00 2001 From: Rangeet Pan Date: Mon, 9 Sep 2024 20:01:21 -0400 Subject: [PATCH 2/5] change subprocess command to add -t for target files --- cldk/analysis/java/codeanalyzer/codeanalyzer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cldk/analysis/java/codeanalyzer/codeanalyzer.py b/cldk/analysis/java/codeanalyzer/codeanalyzer.py index 4a6bb2f..207a348 100644 --- a/cldk/analysis/java/codeanalyzer/codeanalyzer.py +++ b/cldk/analysis/java/codeanalyzer/codeanalyzer.py @@ -235,9 +235,9 @@ def _init_codeanalyzer(self, analysis_level=1) -> JApplication: codeanalyzer_args = '' # If target file is provided, the input is merged into a single string and passed to codeanalyzer if self.target_files: - target_file_options = ' '.join([s.strip() for s in self.target_files]) + target_file_options = '-t '.join([s.strip() for s in self.target_files]) codeanalyzer_args = codeanalyzer_exec + shlex.split( - f"-i {Path(self.project_dir)} --analysis-level={analysis_level} --target-files={target_file_options}" + f"-i {Path(self.project_dir)} --analysis-level={analysis_level} -t {target_file_options}" ) else: codeanalyzer_args = codeanalyzer_exec + shlex.split( @@ -267,10 +267,10 @@ def _init_codeanalyzer(self, analysis_level=1) -> JApplication: codeanalyzer_args = '' # If target file is provided, the input is merged into a single string and passed to codeanalyzer if self.target_files: - target_file_options = ' '.join([s.strip() for s in self.target_files]) + target_file_options = '-t 
'.join([s.strip() for s in self.target_files]) codeanalyzer_args = codeanalyzer_exec + shlex.split( f"-i {Path(self.project_dir)} --analysis-level={analysis_level}" - f" -o {self.analysis_json_path} --target-files={target_file_options}" + f" -o {self.analysis_json_path} -t {target_file_options}" ) else: codeanalyzer_args = codeanalyzer_exec + shlex.split( From 50df25f89a69fab39d813183bee9b07d210c2e04 Mon Sep 17 00:00:00 2001 From: Rangeet Pan Date: Mon, 9 Sep 2024 20:03:45 -0400 Subject: [PATCH 3/5] change subprocess command to add -t for target files --- cldk/models/java/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cldk/models/java/models.py b/cldk/models/java/models.py index 0743502..d34e3e9 100644 --- a/cldk/models/java/models.py +++ b/cldk/models/java/models.py @@ -291,7 +291,6 @@ class JType(BaseModel): is_record_declaration: bool = False is_concrete_class: bool = False is_entry_point: bool = False - is_modified: bool = False comment: str extends_list: List[str] = [] implements_list: List[str] = [] @@ -342,6 +341,7 @@ class JCompilationUnit(BaseModel): comment: str imports: List[str] type_declarations: Dict[str, JType] + is_modified: bool = False class JMethodDetail(BaseModel): From 5e2d45cbc961fcfcc84bc3085707851daead094d Mon Sep 17 00:00:00 2001 From: Rangeet Pan Date: Mon, 9 Sep 2024 21:44:46 -0400 Subject: [PATCH 4/5] add logic to run codeanalyzer when target files are provided --- .../java/codeanalyzer/codeanalyzer.py | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/cldk/analysis/java/codeanalyzer/codeanalyzer.py b/cldk/analysis/java/codeanalyzer/codeanalyzer.py index 355c7ec..c6068f1 100644 --- a/cldk/analysis/java/codeanalyzer/codeanalyzer.py +++ b/cldk/analysis/java/codeanalyzer/codeanalyzer.py @@ -200,10 +200,9 @@ def _init_codeanalyzer(self, analysis_level=1) -> JApplication: """ codeanalyzer_exec = self._get_codeanalyzer_exec() - + codeanalyzer_args = '' if self.analysis_json_path 
is None: logger.info("Reading analysis from the pipe.") - codeanalyzer_args = '' # If target file is provided, the input is merged into a single string and passed to codeanalyzer if self.target_files: target_file_options = '-t '.join([s.strip() for s in self.target_files]) @@ -227,24 +226,29 @@ def _init_codeanalyzer(self, analysis_level=1) -> JApplication: raise CodeanalyzerExecutionException(str(e)) from e else: + # Check if the code analyzer needs to be run + is_run_code_analyzer = False analysis_json_path_file = Path(self.analysis_json_path).joinpath("analysis.json") - if not analysis_json_path_file.exists() or self.eager_analysis: - # If the analysis file does not exist, we'll run the analysis. Alternately, if the eager_analysis - # flag is set, we'll run the analysis every time the object is created. This will happen regradless - # of the existence of the analysis file. - # Create the executable command for codeanalyzer. - codeanalyzer_args = '' - # If target file is provided, the input is merged into a single string and passed to codeanalyzer - if self.target_files: - target_file_options = '-t '.join([s.strip() for s in self.target_files]) - codeanalyzer_args = codeanalyzer_exec + shlex.split( - f"-i {Path(self.project_dir)} --analysis-level={analysis_level}" - f" -o {self.analysis_json_path} -t {target_file_options}" - ) - else: + # If target file is provided, the input is merged into a single string and passed to codeanalyzer + if self.target_files: + target_file_options = '-t '.join([s.strip() for s in self.target_files]) + codeanalyzer_args = codeanalyzer_exec + shlex.split( + f"-i {Path(self.project_dir)} --analysis-level={analysis_level}" + f" -o {self.analysis_json_path} -t {target_file_options}" + ) + is_run_code_analyzer = True + else: + if not analysis_json_path_file.exists() or self.eager_analysis: + # If the analysis file does not exist, we'll run the analysis. 
Alternately, if the eager_analysis + # flag is set, we'll run the analysis every time the object is created. This will happen regardless + # of the existence of the analysis file. + # Create the executable command for codeanalyzer. codeanalyzer_args = codeanalyzer_exec + shlex.split( - f"-i {Path(self.project_dir)} --analysis-level={analysis_level} -o {self.analysis_json_path}" - ) + f"-i {Path(self.project_dir)} --analysis-level={analysis_level} -o {self.analysis_json_path}" + ) + is_run_code_analyzer = True + + if is_run_code_analyzer: try: logger.info(f"Running codeanalyzer subprocess with args {codeanalyzer_args}") subprocess.run( @@ -258,7 +262,6 @@ def _init_codeanalyzer(self, analysis_level=1) -> JApplication: except Exception as e: raise CodeanalyzerExecutionException(str(e)) from e - with open(analysis_json_path_file) as f: data = json.load(f) return JApplication(**data) From 9c45cf3fdba5acda162ad839a74d625f845f4ff3 Mon Sep 17 00:00:00 2001 From: Rangeet Pan Date: Mon, 9 Sep 2024 21:49:05 -0400 Subject: [PATCH 5/5] add logic to run codeanalyzer when target files are provided --- cldk/analysis/java/codeanalyzer/codeanalyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cldk/analysis/java/codeanalyzer/codeanalyzer.py b/cldk/analysis/java/codeanalyzer/codeanalyzer.py index c6068f1..41d260b 100644 --- a/cldk/analysis/java/codeanalyzer/codeanalyzer.py +++ b/cldk/analysis/java/codeanalyzer/codeanalyzer.py @@ -205,7 +205,7 @@ def _init_codeanalyzer(self, analysis_level=1) -> JApplication: logger.info("Reading analysis from the pipe.") # If target file is provided, the input is merged into a single string and passed to codeanalyzer if self.target_files: - target_file_options = '-t '.join([s.strip() for s in self.target_files]) + target_file_options = ' -t '.join([s.strip() for s in self.target_files]) codeanalyzer_args = codeanalyzer_exec + shlex.split( f"-i {Path(self.project_dir)} --analysis-level={analysis_level} -t 
{target_file_options}" ) @@ -231,7 +231,7 @@ def _init_codeanalyzer(self, analysis_level=1) -> JApplication: analysis_json_path_file = Path(self.analysis_json_path).joinpath("analysis.json") # If target file is provided, the input is merged into a single string and passed to codeanalyzer if self.target_files: - target_file_options = '-t '.join([s.strip() for s in self.target_files]) + target_file_options = ' -t '.join([s.strip() for s in self.target_files]) codeanalyzer_args = codeanalyzer_exec + shlex.split( f"-i {Path(self.project_dir)} --analysis-level={analysis_level}" f" -o {self.analysis_json_path} -t {target_file_options}"