Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ build/*
dist/*
env/*
.vscode/
.idea/
.ipynb_checkpoints/
config.json
__pycache__/
Expand Down
18 changes: 17 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ Inspect4py currently works **only for Python 3 projects**.

## Background:

`inspect4py` can also capture [Data Flow Graphs](http://bears.ece.ucsb.edu/research-info/DP/dfg.html) for each function, a feature inspired by GraphCodeBERT: [Github](https://github.com/microsoft/CodeBERT) & [Paper](https://arxiv.org/abs/2009.08366). An illustration is given below:
|Source Code|List Output|Networkx Image|
|:-:|:-:|:-:|
|<pre>def max(a, b):<br>x = 0<br> if a > b:<br> x = a<br>else:<br> x = b<br> return x</pre>|<pre>('a', 3, 'comesFrom', [], [])<br>('b', 5, 'comesFrom', [], [])<br>('x', 8, 'computedFrom', ['0'], [10])<br>('0', 10, 'comesFrom', [], [])<br>('a', 12, 'comesFrom', ['a'], [3])<br>('b', 14, 'comesFrom', ['b'], [5])<br>('x', 16, 'computedFrom', ['a'], [18])<br>('a', 18, 'comesFrom', ['a'], [3])<br>('x', 21, 'computedFrom', ['b'], [23])<br>('b', 23, 'comesFrom', ['b'], [5])<br>('x', 25, 'comesFrom', ['x'], [16, 21])</pre>|![image](docs/images/data_flow.png)|

`inspect4py` uses [ASTs](https://en.wikipedia.org/wiki/Abstract_syntax_tree), more specifically
the [ast](https://docs.python.org/3/library/ast.html) module in Python, generating
a tree of objects (per file) whose classes all inherit from [ast.AST](https://docs.python.org/3/library/ast.html#ast.AST).
Expand Down Expand Up @@ -60,6 +65,12 @@ Please cite our MSR 2022 demo paper:

### Preliminaries

Make sure you have tree-sitter installed. A C compiler is needed to build it; see the tree-sitter repository for more [info](https://github.com/tree-sitter/tree-sitter):

```
pip install tree-sitter
```

Make sure you have graphviz installed:

```
Expand All @@ -71,7 +82,7 @@ We have tested `inspect4py` in Python 3.7+. **Our recommended version is Python


### Operating System
We have tested `inspect4py` in Unix and MacOs.
We have tested `inspect4py` on Unix, macOS and Windows 11 (22621.1265).

### Installation from pypi
`inspect4py` is [available in pypi!](https://pypi.org/project/inspect4py/) Just install it like a regular package:
Expand Down Expand Up @@ -106,6 +117,9 @@ pigar
setuptools==54.2.0
json2html
configparser
bigcode_astgen
GitPython
tree-sitter
```

If you want to run the evaluations, do not forget to add `pandas` to the previous set.
Expand Down Expand Up @@ -218,6 +232,8 @@ Options:
-rm, --readme extract all readme files in the target repository.
-md, --metadata extract metadata of the target repository using
Github API.
-df, --data_flow extract data flow graph for every function, BOOL
-st, --symbol_table symbol table file location. STR
--help Show this message and exit.
```

Expand Down
Binary file added docs/images/data_flow.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion inspect4py/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.0.6'
__version__ = '0.0.7'
60 changes: 51 additions & 9 deletions inspect4py/cli.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import ast
import json
import tokenize
import types
import builtins
import click
from docstring_parser import parse as doc_parse
from tree_sitter import Language, Parser

from inspect4py import __version__
from inspect4py.staticfg import builder
from inspect4py.utils import *
# from utils import *

"""
Code Inspector
Expand All @@ -26,7 +29,7 @@


class CodeInspection:
def __init__(self, path, out_control_flow_path, out_json_path, control_flow, abstract_syntax_tree, source_code):
def __init__(self, path, out_control_flow_path, out_json_path, control_flow, abstract_syntax_tree, source_code, data_flow, parser):
""" init method initializes the Code_Inspection object
:param self self: represent the instance of the class
:param str path: the file to inspect
Expand All @@ -41,6 +44,8 @@ def __init__(self, path, out_control_flow_path, out_json_path, control_flow, abs
self.out_json_path = out_json_path
self.abstract_syntax_tree = abstract_syntax_tree
self.source_code = source_code
self.data_flow = data_flow
self.parser = parser
self.tree = self.parser_file()
if self.tree != "AST_ERROR":
self.nodes = self.walk()
Expand All @@ -51,13 +56,14 @@ def __init__(self, path, out_control_flow_path, out_json_path, control_flow, abs
self.bodyInfo = self.inspect_body()
if control_flow:
self.out_control_flow_path = out_control_flow_path
self.controlFlowInfo = self.inspect_controlflow()
self.controlFlowInfo = self.inspect_controlflow("png")
else:
self.controlFlowInfo = {}
self.fileJson = self.file_json()
else:
self.fileJson = {}


def find_classDef(self):
classDef_nodes = [node for node in self.nodes if isinstance(node, ast.ClassDef)]
class_init=[]
Expand Down Expand Up @@ -466,6 +472,13 @@ def file_json(self):
json.dump(prune_json(file_dict), outfile)
return [file_dict, json_file]

# def get_parser_data_flow(self):
# parser = Parser()
# LANGUAGE = Language(self.symbol_table, "python")
# parser.set_language(LANGUAGE)
# parser = [parser, DFG_python]
# return parser

def _f_definitions(self, functions_definitions):
"""_f_definitions extracts the name, args, docstring
returns, raises of a list of functions or a methods.
Expand All @@ -477,11 +490,15 @@ def _f_definitions(self, functions_definitions):
:param list functions_definitions: represent a list with all functions or methods nodes
:return dictionary: a dictionary with the all the information at function/method level
"""

# print(functions_definitions)
funcs_info = {}
for f in functions_definitions:
# for node in ast.walk(f):
# print(node.name)

funcs_info[f.name] = {}
ds_f = ast.get_docstring(f)
# print(ds_f)
try:
docstring = doc_parse(ds_f)
funcs_info[f.name]["doc"] = {}
Expand Down Expand Up @@ -577,7 +594,10 @@ def _f_definitions(self, functions_definitions):
funcs_info[f.name]["ast"] = ast_to_json(f)
if self.source_code:
funcs_info[f.name]["source_code"] = ast_to_source_code(f)

if self.data_flow:
code_tokens, dfg = extract_dataflow(funcs_info[f.name]["source_code"], self.parser, "python")
funcs_info[f.name]["data_flow"] = dfg
funcs_info[f.name]["code_tokens"] = code_tokens
return funcs_info

def _skip_dynamic_calls(self, funcs_info, classes_info, check_name, name, var_name):
Expand Down Expand Up @@ -1204,6 +1224,7 @@ def create_output_dirs(output_dir, control_flow):
@click.option('-i', '--input_path', type=str, required=True, help="input path of the file or directory to inspect.")
@click.option('-o', '--output_dir', type=str, default="output_dir",
help="output directory path to store results. If the directory does not exist, the tool will create it.")
@click.option('-st','--symbol_table', type=str, default="my_language.so", help="symbol table for the target function")
@click.option('-ignore_dir', '--ignore_dir_pattern', multiple=True, default=[".", "__pycache__"],
help="ignore directories starting with a certain pattern. This parameter can be provided multiple times "
"to ignore multiple directory patterns.")
Expand Down Expand Up @@ -1231,16 +1252,35 @@ def create_output_dirs(output_dir, control_flow):
help="extract all readme files in the target repository.")
@click.option('-md', '--metadata', type=bool, is_flag=True,
help="extract metadata of the target repository using Github API. (requires repository to have the .git folder)")
@click.option('-df', '--data_flow', type=bool, is_flag=True,
help="extract data flow graph of every function in the target repository")

def main(input_path, output_dir, ignore_dir_pattern, ignore_file_pattern, requirements, html_output, call_list,
control_flow, directory_tree, software_invocation, abstract_syntax_tree, source_code, license_detection, readme,
metadata):
metadata, data_flow, symbol_table):
if data_flow:
if symbol_table == "my_language.so": # default option
path_to_languages = str(Path(__file__).parent / "resources")
if sys.platform.startswith("win") or sys.platform.startswith("cygwin"):
language = Language(path_to_languages + os.path.sep + "python_win.so", "python")
else:
language = Language(path_to_languages + os.path.sep + "python_unix.so", "python")
else:
language = Language(symbol_table, "python")
parser = Parser()
parser.set_language(language)
parser = [parser, DFG_python]
else:
parser = []

# print(parsers)
if (not os.path.isfile(input_path)) and (not os.path.isdir(input_path)):
print('The file or directory specified does not exist')
sys.exit()

if os.path.isfile(input_path):
cf_dir, json_dir = create_output_dirs(output_dir, control_flow)
code_info = CodeInspection(input_path, cf_dir, json_dir, control_flow, abstract_syntax_tree, source_code)
code_info = CodeInspection(input_path, cf_dir, json_dir, control_flow, abstract_syntax_tree, source_code, data_flow, parser)

# Generate the call list of a file
call_list_data = call_list_file(code_info)
Expand Down Expand Up @@ -1279,18 +1319,20 @@ def main(input_path, output_dir, ignore_dir_pattern, ignore_file_pattern, requir
for f in files:
if ".py" in f and not f.endswith(".pyc"):
try:

path = os.path.join(subdir, f)
relative_path = Path(subdir).relative_to(Path(input_path).parent)
out_dir = str(Path(output_dir) / relative_path)
cf_dir, json_dir = create_output_dirs(out_dir, control_flow)
code_info = CodeInspection(path, cf_dir, json_dir, control_flow, abstract_syntax_tree, source_code)
code_info = CodeInspection(path, cf_dir, json_dir, control_flow, abstract_syntax_tree, source_code, data_flow, parser)
# print(parsers)
if code_info.fileJson:
if out_dir not in dir_info:
dir_info[out_dir] = [code_info.fileJson[0]]
else:
dir_info[out_dir].append(code_info.fileJson[0])
except:
print("Error when processing " + f + ": ", sys.exc_info()[0])
print("Error when processing " + f + ": ", sys.exc_info())
continue

# Generate the call list of the Dir
Expand Down Expand Up @@ -1332,7 +1374,7 @@ def main(input_path, output_dir, ignore_dir_pattern, ignore_file_pattern, requir
dir_info["software_type"] = "not found"
if license_detection:
try:
licenses_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "licenses")
licenses_path = str(Path(__file__).parent / "licenses")
license_text = extract_license(input_path)
rank_list = detect_license(license_text, licenses_path)
dir_info["license"] = {}
Expand Down
4 changes: 3 additions & 1 deletion inspect4py/parse_setup_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ def parse_setup_py(parent_dir):
if single_line:
elem = setup_content[console_index]
cs = elem.split("=")
cs_string = cs[0].strip().replace('\'', '').split('["')[1]
# print(cs)
# print(cs[1].strip())
cs_string = cs[1].strip().replace('\'', '').split('["')[1]
cs_list.append(normalize(cs_string))
setup_info["installation"] = "pip install " + cs_string
setup_info["run"].append(cs_string)
Expand Down
Binary file added inspect4py/resources/python_unix.so
Binary file not shown.
Binary file added inspect4py/resources/python_win.so
Binary file not shown.
Loading