r"""Parse bazel query outputs
A larger description of the system:
- inputs:
  - bazel query //... --output proto > query_result.pb
    - the full dependency tree
  - bazel test //... --build_event_binary_file=test_all.pb
  - bazel run //utils:bep_reader < test_all.pb
    - the execution time related to each test target
  - git_utils.get_file_commit_map_from_log
    - how files have changed over time, which can be used to estimate the
      probability of each file changing in the future
- intermediates:
  - a shared representation of source files and bazel targets
- outputs:
  - test targets:
    - likelihood of executing
    - expected value of runtime
  - source files:
    - cost in execution time of modification
    - expected cost of a file change (probability of change * cost; see the
      cost sketch near the end of this docstring)
  - a graph with the values above, so that for any set of changed files we
    can describe the cost
  - a graph from which we can identify overly depended-upon targets
- git log --since="10 years ago" --name-only --pretty=format: | sort \
  | uniq -c | sort -nr
  - this is much faster (a single git invocation plus shell post-processing)
  - could identify renames via:
    - git log --since="1 month ago" --name-status --pretty=format: \
      | grep -P 'R[0-9]*\t' | awk '{print $2, "->", $3}'
    - then correct the counts for renamed paths
  - can get commit association via:
    - git log --since="1 month ago" --name-status --pretty=format:"%H"
    - statuses are A, M, D, and R\d\d\d (rename with similarity score)
```
import re

# Regex pattern to match the git log output
pattern = r"^([AMD])\s+(.+?)(\s*->\s*(.+))?$|^R(\d+)\s+(.+?)\s*->\s*(.+)$"

# Parse each line using the regex
for line in git_log_output.strip().split('\n'):
    match = re.match(pattern, line.strip())
    if match:
        if match.group(1):  # For A, M, D statuses
            change_type = match.group(1)
            old_file = match.group(2)
            new_file = match.group(4) if match.group(4) else None
            print(f"Change type: {change_type}, Old file: {old_file}, "
                  f"New file: {new_file}")
        elif match.group(5):  # For R status (renames)
            change_type = 'R'
            similarity_index = match.group(5)
            old_file = match.group(6)
            new_file = match.group(7)
            print(f"Change type: {change_type}, Similarity index:"
                  f" {similarity_index}, Old file: {old_file}, New file:"
                  f" {new_file}")
```
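The snippet above assumes `git_log_output` already holds the raw log text; a
minimal sketch of capturing it with the command from the outline (hypothetical
glue code; the real pipeline uses git_utils.get_file_commit_map_from_log):
```
import subprocess

# Capture the raw --name-status log; %H prints each commit hash so the file
# entries that follow it can be associated with their commit.
git_log_output = subprocess.check_output(
    ["git", "log", "--since=1 month ago", "--name-status", "--pretty=format:%H"],
    text=True,
)
```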
Example Script:
repo_dir=`pwd`
file_commit_pb=$repo_dir/file_commit.pb
query_pb=$repo_dir/s_result.pb
bep_pb=$repo_dir/test_all.pb
out_gml=$repo_dir/my.gml
out_csv=$repo_dir/my.csv
out_html=$repo_dir/my.html
# Prepare data
bazel query "//... - //docs/... - //third_party/bazel/..." --output proto \
> $query_pb
bazel test //... --build_event_binary_file=$bep_pb
bazel run //apps/bazel_parser --output_groups=-mypy -- git-capture --repo-dir \
$repo_dir --days-ago 400 --file-commit-pb $file_commit_pb
# Separate step if we want build timing data
bazel clean
bazel build --noremote_accept_cached \
--experimental_execution_log_compact_file=exec_log.pb.zst \
--generate_json_trace_profile --profile=example_profile_new.json \
//...
# Would then need to process the exec_log.pb.zst file to get timing from it and
# then add to the other timing information
# Process and visualize the data
bazel run //apps/bazel_parser --output_groups=-mypy -- process \
--file-commit-pb $file_commit_pb --query-pb $query_pb --bep-pb \
$bep_pb --out-gml $out_gml --out-csv $out_csv
bazel run //apps/bazel_parser --output_groups=-mypy -- visualize \
--gml $out_gml --out-html $out_html
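
For the "expected cost of file change" output described at the top, a minimal
sketch of the arithmetic (the numbers are made up; the real values come from
the git history and test runtimes gathered above):
```
p_change = 0.2          # probability the file changes in a given commit
downstream_test_s = 45  # runtime of the tests invalidated when it changes
expected_cost_s = p_change * downstream_test_s  # 9.0s expected per commit
```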
"""
import datetime
import logging
import pathlib
import subprocess
import sys
import tempfile
import click
import networkx
import pandas
import pydantic
import yaml
from apps.bazel_parser import panel
from apps.bazel_parser import parsing
from apps.bazel_parser import refinement
from tools import bazel_utils
from tools import git_pb2
from tools import git_utils
from utils import bep_reader
logger = logging.getLogger(__name__)
PATH_TYPE = click.Path(exists=True, path_type=pathlib.Path)
OUT_PATH_TYPE = click.Path(exists=False, path_type=pathlib.Path)
class Config(pydantic.BaseModel):
    # Error if extra arguments
    model_config = pydantic.ConfigDict(extra="forbid")

    query_target: str
    test_target: str
    days_ago: int
    refinement: refinement.RefinementConfig


def load_config(config_yaml_path: pathlib.Path, overrides: dict) -> Config:
    with open(config_yaml_path, "r") as f:
        raw_data = yaml.safe_load(f)
    # Apply overrides
    raw_data.update(overrides)
    # Validates and parses
    return Config(**raw_data)

def get_config(
    config_file: pathlib.Path | None, days_ago: int | None
) -> Config:
    if config_file:
        overrides = {}
        if days_ago is not None:
            overrides["days_ago"] = days_ago
        return load_config(config_file, overrides=overrides)
    else:
        assert days_ago is not None
        return Config(
            query_target="//...",
            test_target="//...",
            days_ago=days_ago,
            refinement=refinement.RefinementConfig(
                name_patterns=[],
                class_patterns=[],
                class_pattern_to_name_patterns={},
            ),
        )
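

# A sketch of the YAML shape load_config() expects: top-level keys mirror the
# Config fields above and `refinement` mirrors RefinementConfig. The values
# here are hypothetical placeholders, not recommendations.
#
#   query_target: "//..."
#   test_target: "//..."
#   days_ago: 180
#   refinement:
#     name_patterns: []
#     class_patterns: []
#     class_pattern_to_name_patterns: {}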
@click.group()
def cli() -> None:
    """Top-level command group; subcommands are registered in __main__."""


@click.command()
@click.option("--repo-dir", type=PATH_TYPE, required=True)
@click.option("--days-ago", type=int, required=True)
@click.option("--file-commit-pb", type=OUT_PATH_TYPE, required=True)
def git_capture(
    repo_dir: pathlib.Path,
    days_ago: int,
    file_commit_pb: pathlib.Path,
) -> None:
    """Capture git change history and write it as a serialized FileCommitMap."""
    git_query_after = datetime.datetime.now() - datetime.timedelta(
        days=days_ago
    )
    file_commit_map = git_utils.get_file_commit_map_from_log(
        git_directory=repo_dir, after=git_query_after
    )
    file_commit_pb.write_bytes(
        file_commit_map.to_proto().SerializeToString(deterministic=True)
    )

@click.command()
@click.option("--query-pb", type=PATH_TYPE, required=True)
@click.option("--bep-pb", type=PATH_TYPE, required=True)
@click.option("--file-commit-pb", type=PATH_TYPE, required=True)
@click.option("--out-gml", type=OUT_PATH_TYPE, required=True)
@click.option("--out-csv", type=OUT_PATH_TYPE, required=True)
@click.option("--config-file", type=PATH_TYPE, required=False)
def process(
    query_pb: pathlib.Path,
    bep_pb: pathlib.Path,
    file_commit_pb: pathlib.Path,
    out_gml: pathlib.Path,
    out_csv: pathlib.Path,
    config_file: pathlib.Path | None,
) -> None:
    """Combine captured query, BEP, and git data into GML and CSV outputs."""
    # days_ago is unused by this command; get_config just needs some value.
    config = get_config(config_file, days_ago=28)
    logger.info("Query...")
    query_result = bazel_utils.parse_build_output(query_pb.read_bytes())
    logger.info("Runtime...")
    with bep_pb.open("rb") as bep_buf:
        label_to_runtime = bep_reader.get_label_to_runtime(bep_buf)
    logger.info("Probability...")
    file_commit_proto = git_pb2.FileCommitMap()
    file_commit_proto.ParseFromString(file_commit_pb.read_bytes())
    file_commit_map = git_utils.FileCommitMap.from_proto(file_commit_proto)
    logger.info("Get graph data")
    r = parsing.get_repo_graph_data(
        query_result=query_result,
        label_to_runtime=label_to_runtime,
        file_commit_map=file_commit_map,
    )
    logger.info("Refining...")
    refinement.full_refinement(
        repo=r,
        refinement=config.refinement,
        verbosity=refinement.Verbosity.COUNT,
    )
    r.refresh()
    graph_metrics = r.get_graph_metrics()
    logger.info("Graph metrics: %s", graph_metrics)
    # TODO: serialize graph_metrics to output file after case studies
    r.to_csv(out_csv)
    r.to_gml(out_gml)

@click.command()
@click.option("--repo-dir", type=PATH_TYPE, required=True)
@click.option("--days-ago", type=int, required=False)
@click.option("--config-file", type=PATH_TYPE, required=False)
@click.option("--out-gml", type=OUT_PATH_TYPE, required=False)
@click.option("--out-csv", type=OUT_PATH_TYPE, required=False)
def full(
    repo_dir: pathlib.Path,
    days_ago: int | None,
    config_file: pathlib.Path | None,
    out_gml: pathlib.Path | None,
    out_csv: pathlib.Path | None,
) -> None:
    """Run the whole pipeline in one shot: bazel query, bazel test, git history, process."""
    config = get_config(config_file, days_ago=days_ago)
    # Query for graph
    logger.info("Querying...")
    query_pb = subprocess.check_output(
        [
            "bazel",
            "query",
            "--notool_deps",
            "--output",
            "proto",
            config.query_target,
        ],
        cwd=repo_dir,
    )
    query_result = bazel_utils.parse_build_output(query_pb)
    # Test for timing
    logger.info("Testing...")
    with tempfile.NamedTemporaryFile() as tmpfile:
        bep_pb = tmpfile.name
        subprocess.check_call(
            [
                "bazel",
                "test",
                f"--build_event_binary_file={bep_pb}",
                config.test_target,
            ],
            cwd=repo_dir,
        )
        with open(bep_pb, "rb") as bep_buf:
            label_to_runtime = bep_reader.get_label_to_runtime(bep_buf)
    # Capture git information
    logger.info("History from git...")
    git_query_after = datetime.datetime.now() - datetime.timedelta(
        days=config.days_ago
    )
    file_commit_map = git_utils.get_file_commit_map_from_log(
        git_directory=repo_dir, after=git_query_after
    )
    logger.info("Parsing...")
    r = parsing.get_repo_graph_data(
        query_result=query_result,
        label_to_runtime=label_to_runtime,
        file_commit_map=file_commit_map,
    )
    logger.info("Refining...")
    refinement.full_refinement(
        repo=r,
        refinement=config.refinement,
        verbosity=refinement.Verbosity.COUNT,
    )
    logger.info("Outputting...")
    graph_metrics = r.get_graph_metrics()
    logger.info("Graph metrics: %s", graph_metrics)
    # TODO: serialize graph_metrics to output file after case studies
    if out_csv is not None:
        r.to_csv(out_csv)
    if out_gml is not None:
        r.to_gml(out_gml)
    logger.info("Done...")

def _emit_refinement_suggestions(df: pandas.DataFrame) -> None:
    """Flag node classes that may distort analysis and suggest class_patterns entries."""
    total_by_class = df.groupby("node_class").size().rename("total")
    seen: set[str] = set()

    def _check(
        mask: pandas.Series,
        description: str,
        suggestions: list[tuple[str, str, int, int, float]],
    ) -> None:
        flagged = df[mask & ~df["node_class"].isin(seen)]
        if len(flagged) == 0:
            return
        counts = flagged.groupby("node_class").size().rename("count")
        stats: pandas.DataFrame = pandas.concat(
            [total_by_class, counts], axis=1
        ).dropna()
        stats["pct"] = stats["count"] / stats["total"] * 100
        mask = (stats["pct"] > 50) & (stats["count"] >= 10)
        candidates = pandas.DataFrame(stats.loc[mask]).sort_values(
            by="count", ascending=False
        )
        for node_class, row in candidates.iterrows():
            seen.add(str(node_class))
            suggestions.append(
                (
                    str(node_class),
                    description,
                    int(row["count"]),
                    int(row["total"]),
                    row["pct"],
                )
            )

    suggestions: list[tuple[str, str, int, int, float]] = []
    _check(
        (df["num_ancestors"] == 0) & (df["num_descendants"] == 0),
        "fully isolated (no edges)",
        suggestions,
    )
    _check(
        (df["num_descendants"] == 0) & ~df["is_source"] & ~df["has_duration"],
        "non-source leaf (distorts ancestor scores of parents)",
        suggestions,
    )
    _check(
        (df["num_ancestors"] + df["num_descendants"] <= 2)
        & (df["num_ancestors"] + df["num_descendants"] > 0)
        & ~df["is_source"]
        & ~df["has_duration"],
        "very low connectivity — likely in a small disconnected component",
        suggestions,
    )
    if suggestions:
        click.echo("\n--- REFINEMENT SUGGESTIONS ---")
        click.echo(
            "Node classes that may distort analysis results.\n"
            "Consider adding to refinement.class_patterns in your config:"
        )
        for node_class, reason, count, total, pct in suggestions:
            click.echo(
                f" {node_class}: {count} of {total} nodes — {reason} "
                f"({pct:.0f}%)"
            )

@click.command()
@click.option("--csv", "csv_path", type=PATH_TYPE, required=True)
@click.option("--top-n", type=int, default=10, show_default=True)
def report(csv_path: pathlib.Path, top_n: int) -> None:
    """Print a text report of bottlenecks, hot source files, and expensive tests."""
    df = pandas.read_csv(csv_path)
    num_nodes = len(df)
    num_sources = df["is_source"].sum()
    num_tests = df["has_duration"].sum()
    total_duration_s = df["node_duration_s"].sum()
    expected_duration_s = (
        df["node_duration_s"] * (1 - df["group_probability_cache_hit"])
    ).sum()
    avg_nodes_per_commit = (1 - df["group_probability_cache_hit"]).sum()
    click.echo("=== BUILD GRAPH REPORT ===")
    click.echo(f"Source: {csv_path}")
    click.echo(
        f"Nodes: {num_nodes} | "
        f"Source files: {num_sources} | "
        f"Tests: {num_tests}"
    )
    click.echo(
        f"Total test duration: {total_duration_s:.1f}s | "
        f"Expected cost/commit: {expected_duration_s:.1f}s | "
        f"Avg nodes invalidated/commit: {avg_nodes_per_commit:.0f}"
    )
    click.echo("\n--- STRUCTURAL BOTTLENECKS ---")
    click.echo(
        "Nodes with both dependents and dependencies, ranked by structural\n"
        "coupling (ancestors × descendants). Splitting reduces blast radius."
    )
    mid = pandas.DataFrame(
        df.loc[(df["num_ancestors"] > 0) & (df["num_descendants"] > 0)]
    )
    for _, row in mid.nlargest(top_n, "ancestors_by_descendants").iterrows():
        click.echo(
            f" {row['node_name']} [{row['node_class']}]\n"
            f" ancestors={row['num_ancestors']}, "
            f"descendants={row['num_descendants']}, "
            f"score={int(row['ancestors_by_descendants']):,}, "
            f"expected_duration={row['expected_duration_s']:.1f}s"
        )
    click.echo("\n--- COSTLY BOTTLENECKS ---")
    click.echo(
        "Same filter, ranked by expected duration: downstream test time\n"
        "weighted by invalidation probability. Prioritizes CI cost."
    )
    for _, row in mid.nlargest(top_n, "expected_duration_s").iterrows():
        cache_pct = row["group_probability_cache_hit"] * 100
        click.echo(
            f" {row['node_name']} [{row['node_class']}]\n"
            f" expected_duration={row['expected_duration_s']:.1f}s, "
            f"cache_hit={cache_pct:.1f}%, "
            f"score={int(row['ancestors_by_descendants']):,}"
        )
    click.echo("\n--- HOT SOURCE FILES ---")
    click.echo(
        "Source files that change and trigger many downstream rebuilds.\n"
        "Isolating into narrower targets reduces blast radius."
    )
    src = pandas.DataFrame(
        df.loc[df["is_source"] & (df["node_probability_cache_hit"] < 1.0)]
    )
    for _, row in src.nlargest(top_n, "ancestors_by_group_p").iterrows():
        cache_pct = row["node_probability_cache_hit"] * 100
        click.echo(
            f" {row['node_name']}\n"
            f" change_cost={row['ancestors_by_group_p']:.1f}, "
            f"cache_hit={cache_pct:.1f}%, "
            f"downstream_tests={row['group_duration_s']:.1f}s"
        )
    click.echo("\n--- EXPENSIVE TESTS ---")
    click.echo(
        "Test targets with high expected cost (slow and frequently "
        "invalidated).\nReducing their dependencies lowers CI cost per commit."
    )
    tests = df[df["has_duration"]]
    for _, row in tests.nlargest(top_n, "expected_duration_s").iterrows():
        cache_pct = row["group_probability_cache_hit"] * 100
        click.echo(
            f" {row['node_name']}\n"
            f" duration={row['node_duration_s']:.1f}s, "
            f"cache_hit={cache_pct:.1f}%, "
            f"expected_duration={row['expected_duration_s']:.1f}s"
        )
    _emit_refinement_suggestions(df)

@click.command()
@click.option("--gml", type=PATH_TYPE, required=True)
@click.option("--out-html", type=OUT_PATH_TYPE, required=False)
def visualize(
    gml: pathlib.Path,
    out_html: pathlib.Path | None,
) -> None:
    """Load the GML graph and render the panel view, optionally saving HTML."""
    graph = networkx.read_gml(gml)
    panel.run_panel(graph=graph, html_out=out_html)


if __name__ == "__main__":
    logging.basicConfig(
        stream=sys.stderr,
        format=(
            "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d -"
            " %(message)s"
        ),
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.DEBUG,
    )
    cli.add_command(git_capture)
    cli.add_command(process)
    cli.add_command(report)
    cli.add_command(visualize)
    cli.add_command(full)
    cli()