# overseer/main.py

import argparse
from pathlib import Path
from typing import Dict, List, Optional, Set

import pandas as pd
from fpdf import FPDF
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern
from rich.console import Console
from rich.progress import (
    Progress,
    SpinnerColumn,
    TextColumn,
    TimeElapsedColumn,
    BarColumn,
    TaskProgressColumn,
)
from rich.table import Table

import config
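
# The scanner is driven entirely by the local `config` module. A rough,
# illustrative sketch of what it is expected to provide (the names come from
# the usage below; the concrete values here are assumptions, not the real
# configuration):
#
#     DEFAULT_WORKSPACE = "."
#     DEFAULT_SKIP_MARKERS = {"NOTE:"}
#     DEFAULT_EXCLUDES = ["node_modules/", "dist/"]
#     CONTEXT_LINES = 2
#     FILE_PATTERNS = ["*.py", "*.ts"]
#     EXPORT_FORMATS = ["pdf", "xlsx"]
#     COMMENT_MARKERS = {"TODO:": "TODO", "FIXME:": "FIXME"}  # marker -> label
#     COMMENT_COLORS = {"TODO:": "yellow", "FIXME:": "red"}   # marker -> rich style
#     COMMENT_PATTERNS = {
#         "py": {"single": ["#"]},
#         "ts": {"single": ["//"], "multiline": ("/*", "*/")},
#     }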


class CommentScanner:
    """Scan a workspace for marked comments (TODO, FIXME, ...) and report them."""

    def __init__(
        self,
        workspace_path: Optional[str] = None,
        skip_markers: Optional[Set[str]] = None,
        show_context: bool = True,
    ):
        # Resolve relative paths
        workspace_path = workspace_path or config.DEFAULT_WORKSPACE
        self.workspace_path = Path(workspace_path).resolve()
        self.console = Console()
        self.exclude_patterns = self._load_gitignore()
        self.skip_markers = skip_markers or config.DEFAULT_SKIP_MARKERS
        self.show_context = show_context

    def _load_gitignore(self) -> PathSpec:
        gitignore_patterns = []
        gitignore_path = self.workspace_path / ".gitignore"
        # Add default exclusions
        gitignore_patterns.extend(config.DEFAULT_EXCLUDES)
        # Read .gitignore if it exists
        if gitignore_path.exists():
            with open(gitignore_path, "r", encoding="utf-8") as f:
                gitignore_patterns.extend(
                    line.strip()
                    for line in f
                    if line.strip() and not line.startswith("#")
                )
        return PathSpec.from_lines(GitWildMatchPattern, gitignore_patterns)
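
    # How the resulting matcher behaves (illustrative; pathspec's gitwildmatch
    # patterns follow .gitignore semantics):
    #
    #     spec = PathSpec.from_lines(GitWildMatchPattern, ["node_modules/", "*.log"])
    #     spec.match_file("node_modules/pkg/index.js")  # True
    #     spec.match_file("src/main.py")                # False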

    def should_skip_path(
        self,
        path: Path,
        filename_filter: Optional[str] = None,
        case_sensitive: bool = False,
        complete_match: bool = False,
    ) -> bool:
        """Check if a path should be skipped based on exclusion rules and filename filter."""
        try:
            # Convert path to relative path from workspace root
            rel_path = path.relative_to(self.workspace_path)
            # Apply filename filter if provided
            if filename_filter:
                name = path.name if case_sensitive else path.name.lower()
                target = filename_filter if case_sensitive else filename_filter.lower()
                if complete_match:
                    if name != target:
                        return True
                elif target not in name:
                    return True
            # Check if path matches gitignore patterns
            if self.exclude_patterns.match_file(str(rel_path)):
                return True
            # Skip hidden files and directories inside the workspace. Checking
            # rel_path (not the absolute path) keeps a dot-directory among the
            # workspace's own ancestors from excluding every file.
            if any(part.startswith(".") for part in rel_path.parts):
                return True
            return False
        except ValueError:  # For paths outside workspace
            return True

    def get_context_lines(self, all_lines: List[str], comment_line_idx: int) -> str:
        """Return the non-empty lines around a comment, with the comment line marked."""
        context = []
        start_idx = max(0, comment_line_idx - config.CONTEXT_LINES)
        end_idx = min(len(all_lines), comment_line_idx + config.CONTEXT_LINES + 1)
        # Get lines before
        for i in range(start_idx, comment_line_idx):
            line = all_lines[i].strip()
            if line:  # Skip empty lines
                context.append(f"  {line}")
        # Add the comment line itself, marked with an arrow (the PDF export
        # later rewrites this arrow as ">")
        context.append(f"→ {all_lines[comment_line_idx].strip()}")
        # Get lines after
        for i in range(comment_line_idx + 1, end_idx):
            line = all_lines[i].strip()
            if line:  # Skip empty lines
                context.append(f"  {line}")
        return "\n".join(context)

    def scan_file(self, file_path: Path) -> List[Dict]:
        """Scan a single file and return the comment records found in it."""
        comments = []
        file_extension = file_path.suffix.lower()[1:]
        # Skip files we don't support
        if file_extension not in config.COMMENT_PATTERNS:
            return comments
        comment_patterns = config.COMMENT_PATTERNS[file_extension]
        single_patterns = comment_patterns.get("single", [])
        multiline_pattern = comment_patterns.get("multiline")
        # Quick check if file might contain any markers
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
                if not any(marker in content for marker in config.COMMENT_MARKERS):
                    return comments
                # Reset file pointer and continue with line-by-line processing
                f.seek(0)
                lines = f.readlines()
        except UnicodeDecodeError:
            return comments  # Skip binary files
        in_multiline_comment = False
        multiline_content = []
        for line_num, line in enumerate(lines):
            stripped_line = line.strip()
            if not stripped_line:  # Skip empty lines early
                continue
            # Fast path: skip lines that cannot contain a comment. Lines inside
            # an open multiline comment must not be skipped, since their text
            # belongs to the comment even without any marker on the line.
            if not in_multiline_comment and not any(
                pattern in stripped_line for pattern in single_patterns
            ) and not (
                multiline_pattern
                and (
                    multiline_pattern[0] in stripped_line
                    or multiline_pattern[1] in stripped_line
                )
            ):
                continue
            # Handle multiline comments
            if multiline_pattern:
                start_pattern, end_pattern = multiline_pattern
                # Comment opens and closes on the same line
                if (
                    start_pattern in stripped_line
                    and end_pattern
                    in stripped_line[
                        stripped_line.find(start_pattern) + len(start_pattern) :
                    ]
                ):
                    comment_text = stripped_line[
                        stripped_line.find(start_pattern)
                        + len(start_pattern) : stripped_line.rfind(end_pattern)
                    ].strip()
                    self._process_comment(
                        comment_text, comments, file_path, line_num, lines
                    )
                    continue
                if start_pattern in stripped_line and not in_multiline_comment:
                    in_multiline_comment = True
                    multiline_content = [
                        stripped_line[
                            stripped_line.find(start_pattern) + len(start_pattern) :
                        ].strip()
                    ]
                    continue
                if in_multiline_comment:
                    if end_pattern in stripped_line:
                        in_multiline_comment = False
                        multiline_content.append(
                            stripped_line[: stripped_line.find(end_pattern)].strip()
                        )
                        comment_text = " ".join(multiline_content)
                        self._process_comment(
                            comment_text, comments, file_path, line_num, lines
                        )
                        multiline_content = []
                    else:
                        multiline_content.append(stripped_line)
                    continue
            # Handle single-line comments
            for pattern in single_patterns:
                if pattern in stripped_line:
                    comment_text = stripped_line[
                        stripped_line.find(pattern) + len(pattern) :
                    ].strip()
                    self._process_comment(
                        comment_text, comments, file_path, line_num, lines
                    )
                    break
        return comments

    def _process_comment(
        self,
        comment_text: str,
        comments: List[Dict],
        file_path: Path,
        line_num: int,
        lines: List[str],
    ) -> None:
        """Helper method to process and add valid comments to the comments list."""
        for marker in config.COMMENT_MARKERS:
            if comment_text.startswith(marker) and marker not in self.skip_markers:
                comments.append(
                    {
                        "type": marker,
                        "text": comment_text[len(marker) :].strip(),
                        "file": str(file_path.relative_to(self.workspace_path)),
                        "line": line_num + 1,
                        "context": self.get_context_lines(lines, line_num),
                    }
                )
                break
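
    # A recorded comment is a plain dict. An illustrative example (the values
    # are made up; the keys and their meaning come from _process_comment above):
    #
    #     {
    #         "type": "TODO:",
    #         "text": "handle the empty-file case",
    #         "file": "src/app.py",
    #         "line": 42,
    #         "context": "  def handler():\n→ # TODO: handle the empty-file case",
    #     }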

    def scan_workspace(
        self,
        filename_filter: Optional[str] = None,
        case_sensitive: bool = False,
        complete_match: bool = False,
    ) -> List[Dict]:
        all_comments = []
        seen: Set[Path] = set()  # Guard against files matched by several patterns
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeElapsedColumn(),
            console=self.console,
        ) as progress:
            # Start with an indeterminate progress bar
            scan_task = progress.add_task("[cyan]Scanning files...", total=None)
            files_processed = 0
            for pattern in config.FILE_PATTERNS:
                try:
                    for file_path in self.workspace_path.rglob(pattern):
                        if file_path in seen:
                            continue
                        seen.add(file_path)
                        if file_path.is_file() and not self.should_skip_path(
                            file_path, filename_filter, case_sensitive, complete_match
                        ):
                            try:
                                files_processed += 1
                                progress.update(
                                    scan_task,
                                    completed=files_processed,
                                    description=f"[cyan]Scanning: {file_path.name}",
                                )
                                file_comments = self.scan_file(file_path)
                                if file_comments:  # Only extend if we found comments
                                    all_comments.extend(file_comments)
                            except Exception as e:
                                self.console.print(
                                    f"Error scanning {file_path}: {e}", style="red"
                                )
                except Exception as e:
                    self.console.print(f"Error during workspace scan: {e}", style="red")
        return all_comments

    def display_comments(self, comments: List[Dict]):
        table = Table(title="Project Comments Overview", show_lines=True)
        table.add_column("Type", style="bold")
        table.add_column("Comment")
        if self.show_context:
            table.add_column("Context", style="dim")
        table.add_column("File", style="dim")
        table.add_column("Line", style="dim")
        for comment in sorted(comments, key=lambda x: x["type"]):
            row = [
                config.COMMENT_MARKERS[comment["type"]],
                comment["text"],
                comment["file"],
                str(comment["line"]),
            ]
            if self.show_context:
                row.insert(2, comment["context"])
            table.add_row(
                *row, style=config.COMMENT_COLORS.get(comment["type"], "white")
            )
        self.console.print(table)

    def export_to_pdf(self, comments: List[Dict], output_path: str):
        class PDF(FPDF):
            def multi_cell_row(self, line_height, cols, border=1):
                # Wrap every column's text and find the tallest cell
                max_lines = 0
                lines = []
                # Adjust widths based on whether context is shown
                if self.show_context:
                    widths = [20, 60, 60, 60, 20]  # Type, Comment, Context, File, Line
                else:
                    widths = [20, 60, 60, 20]  # Type, Comment, File, Line
                x_start = self.get_x()
                for i, col in enumerate(cols):
                    self.set_x(x_start)
                    lines.append(
                        self.multi_cell(
                            widths[i], line_height, col, border=border, split_only=True
                        )
                    )
                    max_lines = max(max_lines, len(lines[-1]))
                # Draw the cells line by line so every column ends up the same height
                for i in range(max_lines):
                    self.set_x(x_start)
                    for j, width in enumerate(widths):
                        content = lines[j][i] if i < len(lines[j]) else ""
                        self.multi_cell(width, line_height, content, border=border)
                        # multi_cell resets x to the left margin, so reposition
                        # explicitly at the start of the next column
                        self.set_xy(
                            x_start + sum(widths[: j + 1]),
                            self.get_y() - line_height,
                        )
                    self.ln(line_height)
                return max_lines * line_height

        pdf = PDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", size=10)
        pdf.show_context = self.show_context
        # Add title
        pdf.set_font("Arial", "B", 14)
        pdf.cell(0, 10, "Project Comments Overview", ln=True, align="C")
        pdf.ln(5)
        pdf.set_font("Arial", size=10)
        # Headers
        headers = ["Type", "Comment", "File", "Line"]
        if self.show_context:
            headers.insert(2, "Context")
        pdf.set_fill_color(240, 240, 240)
        pdf.multi_cell_row(8, headers)
        # Content
        for comment in sorted(comments, key=lambda x: x["type"]):
            try:
                row = [
                    config.COMMENT_MARKERS[comment["type"]],
                    comment["text"],
                    comment["file"],
                    str(comment["line"]),
                ]
                if self.show_context:
                    # Clean up context for PDF compatibility: the arrow marker
                    # from get_context_lines is not ASCII-safe
                    context = comment["context"]
                    context = context.replace("→", ">")
                    context = context.encode("ascii", "replace").decode("ascii")
                    # Replace line breaks with a separator
                    context = context.replace("\n", " | ")
                    row.insert(2, context)
                # Clean up all cells for PDF compatibility
                row = [
                    str(cell).encode("ascii", "replace").decode("ascii") for cell in row
                ]
                pdf.multi_cell_row(8, row)
            except Exception as e:
                self.console.print(
                    f"Warning: Skipped row due to encoding issue: {e}", style="yellow"
                )
        pdf.output(output_path)

    def export_to_excel(self, comments: List[Dict], output_path: str):
        df_data = []
        for comment in comments:
            row = {
                "Type": config.COMMENT_MARKERS[comment["type"]],
                "Comment": comment["text"],
                "File": comment["file"],
                "Line": comment["line"],
            }
            if self.show_context:
                row["Context"] = comment["context"]
            df_data.append(row)
        df = pd.DataFrame(df_data)
        df.to_excel(output_path, index=False, engine="openpyxl")


def main():
    parser = argparse.ArgumentParser(
        description="Scan project source files for marked comments (TODO, FIXME, ...)",
        epilog="""
Examples:
  %(prog)s                          # Scan all files with default settings
  %(prog)s -w /path/to/project      # Scan a specific workspace
  %(prog)s -f test.py               # Find comments in files containing 'test.py' (case insensitive)
  %(prog)s -f test.py -c            # Find comments in files named exactly 'test.py'
  %(prog)s -f Test.py -C            # Find comments with case-sensitive filename match
  %(prog)s -f test.py -c -C         # Find comments in files named exactly 'test.py' (case sensitive)
  %(prog)s --skip TODO FIXME        # Skip TODO and FIXME comments
  %(prog)s -a                       # Include all comment types
  %(prog)s -e pdf -o comments.pdf   # Export comments to PDF
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--workspace", "-w", type=str, help="Path to the workspace directory"
    )
    parser.add_argument(
        "--skip",
        "-s",
        type=str,
        nargs="+",
        help="Markers to skip (e.g., --skip NOTE TODO)",
        default=list(config.DEFAULT_SKIP_MARKERS),
    )
    parser.add_argument(
        "--include-all",
        "-a",
        action="store_true",
        help="Include all markers (override default skip)",
    )
    parser.add_argument(
        "--no-context",
        "-nc",
        action="store_true",
        help="Don't show context lines around comments",
    )
    parser.add_argument(
        "--export",
        "-e",
        type=str,
        choices=config.EXPORT_FORMATS,
        help="Export format (pdf or xlsx)",
    )
    parser.add_argument("--output", "-o", type=str, help="Output file path for export")
    # Group the filename-filter options so they read together in --help
    filename_group = parser.add_argument_group("filename filtering")
    filename_group.add_argument(
        "--filename",
        "-f",
        type=str,
        help="Filter files by filename (case insensitive by default)",
    )
    filename_group.add_argument(
        "--complete-match",
        "-c",
        action="store_true",
        help="Match complete filename instead of partial (only with -f)",
    )
    filename_group.add_argument(
        "--case-sensitive",
        "-C",
        action="store_true",
        help="Make filename filter case sensitive (only with -f)",
    )
    args = parser.parse_args()
    # Validate flag combinations
    if args.case_sensitive and not args.filename:
        parser.error("--case-sensitive can only be used with --filename")
    if args.complete_match and not args.filename:
        parser.error("--complete-match can only be used with --filename")
    try:
        skip_markers = set() if args.include_all else set(args.skip)
        scanner = CommentScanner(
            args.workspace, skip_markers, show_context=not args.no_context
        )
        comments = scanner.scan_workspace(
            filename_filter=args.filename,
            case_sensitive=args.case_sensitive,
            complete_match=args.complete_match,
        )
        if not comments:
            scanner.console.print("No comments found!", style="yellow")
            return
        # Display in console
        scanner.display_comments(comments)
        # Export if requested
        if args.export:
            if not args.output:
                raise ValueError("Output path (-o) is required when exporting")
            if args.export == "pdf":
                scanner.export_to_pdf(comments, args.output)
            elif args.export == "xlsx":
                scanner.export_to_excel(comments, args.output)
            scanner.console.print(f"\nExported to {args.output}", style="green")
    except Exception as e:
        Console().print(f"Error: {e}", style="red")


if __name__ == "__main__":
    main()
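
# A typical invocation, assuming the config defaults sketched above
# (paths are illustrative):
#
#     python overseer/main.py -w ~/projects/app -f main.py -e xlsx -o comments.xlsx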