# overseer/main.py

import argparse
from pathlib import Path
from typing import Dict, List, Optional, Set

import pandas as pd
from fpdf import FPDF
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern
from rich.console import Console
from rich.progress import (
    Progress,
    SpinnerColumn,
    TextColumn,
    TimeElapsedColumn,
    BarColumn,
    TaskProgressColumn,
)
from rich.table import Table

import config
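
# The scanner is driven entirely by the local `config` module. A rough,
# illustrative sketch of what it is expected to provide (the names come from
# the usage below; the concrete values here are assumptions, not the real
# configuration):
#
#     DEFAULT_WORKSPACE = "."
#     DEFAULT_SKIP_MARKERS = {"NOTE:"}
#     DEFAULT_EXCLUDES = ["node_modules/", "dist/"]
#     CONTEXT_LINES = 2
#     FILE_PATTERNS = ["*.py", "*.ts"]
#     EXPORT_FORMATS = ["pdf", "xlsx"]
#     COMMENT_MARKERS = {"TODO:": "TODO", "FIXME:": "FIXME"}  # marker -> label
#     COMMENT_COLORS = {"TODO:": "yellow", "FIXME:": "red"}   # marker -> rich style
#     COMMENT_PATTERNS = {
#         "py": {"single": ["#"]},
#         "ts": {"single": ["//"], "multiline": ("/*", "*/")},
#     }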


class CommentScanner:
    """Scan a workspace for marked comments (TODO, FIXME, ...) and report them."""

    def __init__(
        self,
        workspace_path: Optional[str] = None,
        skip_markers: Optional[Set[str]] = None,
        show_context: bool = True,
    ):
        # Resolve relative paths
        workspace_path = workspace_path or config.DEFAULT_WORKSPACE
        self.workspace_path = Path(workspace_path).resolve()
        self.console = Console()
        self.exclude_patterns = self._load_gitignore()
        self.skip_markers = skip_markers or config.DEFAULT_SKIP_MARKERS
        self.show_context = show_context

    def _load_gitignore(self) -> PathSpec:
        gitignore_patterns = []
        gitignore_path = self.workspace_path / ".gitignore"
        # Add default exclusions
        gitignore_patterns.extend(config.DEFAULT_EXCLUDES)
        # Read .gitignore if it exists
        if gitignore_path.exists():
            with open(gitignore_path, "r", encoding="utf-8") as f:
                gitignore_patterns.extend(
                    line.strip()
                    for line in f
                    if line.strip() and not line.startswith("#")
                )
        return PathSpec.from_lines(GitWildMatchPattern, gitignore_patterns)
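
    # How the resulting matcher behaves (illustrative; pathspec's gitwildmatch
    # patterns follow .gitignore semantics):
    #
    #     spec = PathSpec.from_lines(GitWildMatchPattern, ["node_modules/", "*.log"])
    #     spec.match_file("node_modules/pkg/index.js")  # True
    #     spec.match_file("src/main.py")                # False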

    def should_skip_path(
        self,
        path: Path,
        filename_filter: Optional[str] = None,
        case_sensitive: bool = False,
        complete_match: bool = False,
    ) -> bool:
        """Check if a path should be skipped based on exclusion rules and filename filter."""
        try:
            # Convert path to relative path from workspace root
            rel_path = path.relative_to(self.workspace_path)
            # Apply filename filter if provided
            if filename_filter:
                name = path.name if case_sensitive else path.name.lower()
                target = filename_filter if case_sensitive else filename_filter.lower()
                if complete_match:
                    if name != target:
                        return True
                elif target not in name:
                    return True
            # Check if path matches gitignore patterns
            if self.exclude_patterns.match_file(str(rel_path)):
                return True
            # Skip hidden files and directories inside the workspace. Checking
            # rel_path (not the absolute path) keeps a dot-directory among the
            # workspace's own ancestors from excluding every file.
            if any(part.startswith(".") for part in rel_path.parts):
                return True
            return False
        except ValueError:  # For paths outside workspace
            return True

    def get_context_lines(self, all_lines: List[str], comment_line_idx: int) -> str:
        """Return the non-empty lines around a comment, with the comment line marked."""
        context = []
        start_idx = max(0, comment_line_idx - config.CONTEXT_LINES)
        end_idx = min(len(all_lines), comment_line_idx + config.CONTEXT_LINES + 1)
        # Get lines before
        for i in range(start_idx, comment_line_idx):
            line = all_lines[i].strip()
            if line:  # Skip empty lines
                context.append(f"  {line}")
        # Add the comment line itself, marked with an arrow (the PDF export
        # later rewrites this arrow as ">")
        context.append(f"→ {all_lines[comment_line_idx].strip()}")
        # Get lines after
        for i in range(comment_line_idx + 1, end_idx):
            line = all_lines[i].strip()
            if line:  # Skip empty lines
                context.append(f"  {line}")
        return "\n".join(context)

    def scan_file(self, file_path: Path) -> List[Dict]:
        """Scan a single file and return the comment records found in it."""
        comments = []
        file_extension = file_path.suffix.lower()[1:]
        # Skip files we don't support
        if file_extension not in config.COMMENT_PATTERNS:
            return comments
        comment_patterns = config.COMMENT_PATTERNS[file_extension]
        single_patterns = comment_patterns.get("single", [])
        multiline_pattern = comment_patterns.get("multiline")
        # Quick check if file might contain any markers
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
                if not any(marker in content for marker in config.COMMENT_MARKERS):
                    return comments
                # Reset file pointer and continue with line-by-line processing
                f.seek(0)
                lines = f.readlines()
        except UnicodeDecodeError:
            return comments  # Skip binary files
        in_multiline_comment = False
        multiline_content = []
        for line_num, line in enumerate(lines):
            stripped_line = line.strip()
            if not stripped_line:  # Skip empty lines early
                continue
            # Fast path: skip lines that cannot contain a comment. Lines inside
            # an open multiline comment must not be skipped, since their text
            # belongs to the comment even without any marker on the line.
            if not in_multiline_comment and not any(
                pattern in stripped_line for pattern in single_patterns
            ) and not (
                multiline_pattern
                and (
                    multiline_pattern[0] in stripped_line
                    or multiline_pattern[1] in stripped_line
                )
            ):
                continue
            # Handle multiline comments
            if multiline_pattern:
                start_pattern, end_pattern = multiline_pattern
                # Comment opens and closes on the same line
                if (
                    start_pattern in stripped_line
                    and end_pattern
                    in stripped_line[
                        stripped_line.find(start_pattern) + len(start_pattern) :
                    ]
                ):
                    comment_text = stripped_line[
                        stripped_line.find(start_pattern)
                        + len(start_pattern) : stripped_line.rfind(end_pattern)
                    ].strip()
                    self._process_comment(
                        comment_text, comments, file_path, line_num, lines
                    )
                    continue
                if start_pattern in stripped_line and not in_multiline_comment:
                    in_multiline_comment = True
                    multiline_content = [
                        stripped_line[
                            stripped_line.find(start_pattern) + len(start_pattern) :
                        ].strip()
                    ]
                    continue
                if in_multiline_comment:
                    if end_pattern in stripped_line:
                        in_multiline_comment = False
                        multiline_content.append(
                            stripped_line[: stripped_line.find(end_pattern)].strip()
                        )
                        comment_text = " ".join(multiline_content)
                        self._process_comment(
                            comment_text, comments, file_path, line_num, lines
                        )
                        multiline_content = []
                    else:
                        multiline_content.append(stripped_line)
                    continue
            # Handle single-line comments
            for pattern in single_patterns:
                if pattern in stripped_line:
                    comment_text = stripped_line[
                        stripped_line.find(pattern) + len(pattern) :
                    ].strip()
                    self._process_comment(
                        comment_text, comments, file_path, line_num, lines
                    )
                    break
        return comments

    def _process_comment(
        self,
        comment_text: str,
        comments: List[Dict],
        file_path: Path,
        line_num: int,
        lines: List[str],
    ) -> None:
        """Helper method to process and add valid comments to the comments list."""
        for marker in config.COMMENT_MARKERS:
            if comment_text.startswith(marker) and marker not in self.skip_markers:
                comments.append(
                    {
                        "type": marker,
                        "text": comment_text[len(marker) :].strip(),
                        "file": str(file_path.relative_to(self.workspace_path)),
                        "line": line_num + 1,
                        "context": self.get_context_lines(lines, line_num),
                    }
                )
                break
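
    # A recorded comment is a plain dict. An illustrative example (the values
    # are made up; the keys and their meaning come from _process_comment above):
    #
    #     {
    #         "type": "TODO:",
    #         "text": "handle the empty-file case",
    #         "file": "src/app.py",
    #         "line": 42,
    #         "context": "  def handler():\n→ # TODO: handle the empty-file case",
    #     }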

    def scan_workspace(
        self,
        filename_filter: Optional[str] = None,
        case_sensitive: bool = False,
        complete_match: bool = False,
    ) -> List[Dict]:
        all_comments = []
        seen: Set[Path] = set()  # Guard against files matched by several patterns
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeElapsedColumn(),
            console=self.console,
        ) as progress:
            # Start with an indeterminate progress bar
            scan_task = progress.add_task("[cyan]Scanning files...", total=None)
            files_processed = 0
            for pattern in config.FILE_PATTERNS:
                try:
                    for file_path in self.workspace_path.rglob(pattern):
                        if file_path in seen:
                            continue
                        seen.add(file_path)
                        if file_path.is_file() and not self.should_skip_path(
                            file_path, filename_filter, case_sensitive, complete_match
                        ):
                            try:
                                files_processed += 1
                                progress.update(
                                    scan_task,
                                    completed=files_processed,
                                    description=f"[cyan]Scanning: {file_path.name}",
                                )
                                file_comments = self.scan_file(file_path)
                                if file_comments:  # Only extend if we found comments
                                    all_comments.extend(file_comments)
                            except Exception as e:
                                self.console.print(
                                    f"Error scanning {file_path}: {e}", style="red"
                                )
                except Exception as e:
                    self.console.print(f"Error during workspace scan: {e}", style="red")
        return all_comments

    def display_comments(self, comments: List[Dict]):
        table = Table(title="Project Comments Overview", show_lines=True)
        table.add_column("Type", style="bold")
        table.add_column("Comment")
        if self.show_context:
            table.add_column("Context", style="dim")
        table.add_column("File", style="dim")
        table.add_column("Line", style="dim")
        for comment in sorted(comments, key=lambda x: x["type"]):
            row = [
                config.COMMENT_MARKERS[comment["type"]],
                comment["text"],
                comment["file"],
                str(comment["line"]),
            ]
            if self.show_context:
                row.insert(2, comment["context"])
            table.add_row(
                *row, style=config.COMMENT_COLORS.get(comment["type"], "white")
            )
        self.console.print(table)

    def export_to_pdf(self, comments: List[Dict], output_path: str):
        class PDF(FPDF):
            def multi_cell_row(self, line_height, cols, border=1):
                # Wrap every column's text and find the tallest cell
                max_lines = 0
                lines = []
                # Adjust widths based on whether context is shown
                if self.show_context:
                    widths = [20, 60, 60, 60, 20]  # Type, Comment, Context, File, Line
                else:
                    widths = [20, 60, 60, 20]  # Type, Comment, File, Line
                x_start = self.get_x()
                for i, col in enumerate(cols):
                    self.set_x(x_start)
                    lines.append(
                        self.multi_cell(
                            widths[i], line_height, col, border=border, split_only=True
                        )
                    )
                    max_lines = max(max_lines, len(lines[-1]))
                # Draw the cells line by line so every column ends up the same height
                for i in range(max_lines):
                    self.set_x(x_start)
                    for j, width in enumerate(widths):
                        content = lines[j][i] if i < len(lines[j]) else ""
                        self.multi_cell(width, line_height, content, border=border)
                        # multi_cell resets x to the left margin, so reposition
                        # explicitly at the start of the next column
                        self.set_xy(
                            x_start + sum(widths[: j + 1]),
                            self.get_y() - line_height,
                        )
                    self.ln(line_height)
                return max_lines * line_height

        pdf = PDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", size=10)
        pdf.show_context = self.show_context
        # Add title
        pdf.set_font("Arial", "B", 14)
        pdf.cell(0, 10, "Project Comments Overview", ln=True, align="C")
        pdf.ln(5)
        pdf.set_font("Arial", size=10)
        # Headers
        headers = ["Type", "Comment", "File", "Line"]
        if self.show_context:
            headers.insert(2, "Context")
        pdf.set_fill_color(240, 240, 240)
        pdf.multi_cell_row(8, headers)
        # Content
        for comment in sorted(comments, key=lambda x: x["type"]):
            try:
                row = [
                    config.COMMENT_MARKERS[comment["type"]],
                    comment["text"],
                    comment["file"],
                    str(comment["line"]),
                ]
                if self.show_context:
                    # Clean up context for PDF compatibility: the arrow marker
                    # from get_context_lines is not ASCII-safe
                    context = comment["context"]
                    context = context.replace("→", ">")
                    context = context.encode("ascii", "replace").decode("ascii")
                    # Replace line breaks with a separator
                    context = context.replace("\n", " | ")
                    row.insert(2, context)
                # Clean up all cells for PDF compatibility
                row = [
                    str(cell).encode("ascii", "replace").decode("ascii") for cell in row
                ]
                pdf.multi_cell_row(8, row)
            except Exception as e:
                self.console.print(
                    f"Warning: Skipped row due to encoding issue: {e}", style="yellow"
                )
        pdf.output(output_path)

    def export_to_excel(self, comments: List[Dict], output_path: str):
        df_data = []
        for comment in comments:
            row = {
                "Type": config.COMMENT_MARKERS[comment["type"]],
                "Comment": comment["text"],
                "File": comment["file"],
                "Line": comment["line"],
            }
            if self.show_context:
                row["Context"] = comment["context"]
            df_data.append(row)
        df = pd.DataFrame(df_data)
        df.to_excel(output_path, index=False, engine="openpyxl")


def main():
    parser = argparse.ArgumentParser(
        description="Scan project source files for marked comments (TODO, FIXME, ...)",
        epilog="""
Examples:
  %(prog)s                          # Scan all files with default settings
  %(prog)s -w /path/to/project      # Scan a specific workspace
  %(prog)s -f test.py               # Find comments in files containing 'test.py' (case insensitive)
  %(prog)s -f test.py -c            # Find comments in files named exactly 'test.py'
  %(prog)s -f Test.py -C            # Find comments with case-sensitive filename match
  %(prog)s -f test.py -c -C         # Find comments in files named exactly 'test.py' (case sensitive)
  %(prog)s --skip TODO FIXME        # Skip TODO and FIXME comments
  %(prog)s -a                       # Include all comment types
  %(prog)s -e pdf -o comments.pdf   # Export comments to PDF
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--workspace", "-w", type=str, help="Path to the workspace directory"
    )
    parser.add_argument(
        "--skip",
        "-s",
        type=str,
        nargs="+",
        help="Markers to skip (e.g., --skip NOTE TODO)",
        default=list(config.DEFAULT_SKIP_MARKERS),
    )
    parser.add_argument(
        "--include-all",
        "-a",
        action="store_true",
        help="Include all markers (override default skip)",
    )
    parser.add_argument(
        "--no-context",
        "-nc",
        action="store_true",
        help="Don't show context lines around comments",
    )
    parser.add_argument(
        "--export",
        "-e",
        type=str,
        choices=config.EXPORT_FORMATS,
        help="Export format (pdf or xlsx)",
    )
    parser.add_argument("--output", "-o", type=str, help="Output file path for export")
    # Group the filename-filter options so they read together in --help
    filename_group = parser.add_argument_group("filename filtering")
    filename_group.add_argument(
        "--filename",
        "-f",
        type=str,
        help="Filter files by filename (case insensitive by default)",
    )
    filename_group.add_argument(
        "--complete-match",
        "-c",
        action="store_true",
        help="Match complete filename instead of partial (only with -f)",
    )
    filename_group.add_argument(
        "--case-sensitive",
        "-C",
        action="store_true",
        help="Make filename filter case sensitive (only with -f)",
    )
    args = parser.parse_args()
    # Validate flag combinations
    if args.case_sensitive and not args.filename:
        parser.error("--case-sensitive can only be used with --filename")
    if args.complete_match and not args.filename:
        parser.error("--complete-match can only be used with --filename")
    try:
        skip_markers = set() if args.include_all else set(args.skip)
        scanner = CommentScanner(
            args.workspace, skip_markers, show_context=not args.no_context
        )
        comments = scanner.scan_workspace(
            filename_filter=args.filename,
            case_sensitive=args.case_sensitive,
            complete_match=args.complete_match,
        )
        if not comments:
            scanner.console.print("No comments found!", style="yellow")
            return
        # Display in console
        scanner.display_comments(comments)
        # Export if requested
        if args.export:
            if not args.output:
                raise ValueError("Output path (-o) is required when exporting")
            if args.export == "pdf":
                scanner.export_to_pdf(comments, args.output)
            elif args.export == "xlsx":
                scanner.export_to_excel(comments, args.output)
            scanner.console.print(f"\nExported to {args.output}", style="green")
    except Exception as e:
        Console().print(f"Error: {e}", style="red")


if __name__ == "__main__":
    main()
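
# A typical invocation, assuming the config defaults sketched above
# (paths are illustrative):
#
#     python overseer/main.py -w ~/projects/app -f main.py -e xlsx -o comments.xlsx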