Source code for contextbench.parsers.diff

"""Parse unified diffs to extract edit locations."""

import os
import re
from typing import Dict, List, Tuple
from ..core import ByteInterval, merge
from ..core.fileio import line_to_byte

def parse_diff(diff_text: str, repo_dir: str) -> Dict[str, List[ByteInterval]]:
    """Map each file touched by a unified diff to its edited byte intervals.

    The diff is parsed with ``_parse_hunks`` using its default mode, so the
    line ranges are in new-file coordinates. Each range is passed to
    ``line_to_byte`` together with the file's path under ``repo_dir``.

    Args:
        diff_text: Unified diff text.
        repo_dir: Directory that the diff's relative paths are joined onto.

    Returns:
        ``{relative_path: merge(spans)}``. A file is omitted when its path
        does not exist under ``repo_dir`` or when none of its ranges yields
        a truthy span.
    """
    byte_ranges = {}
    for rel_path, hunks in _parse_hunks(diff_text).items():
        full_path = os.path.join(repo_dir, rel_path)
        if os.path.exists(full_path):
            # Convert every (start, end) line range, then drop falsy results.
            # NOTE(review): presumably line_to_byte returns a falsy value for
            # ranges it cannot resolve - confirm against core.fileio.
            candidates = [
                line_to_byte(full_path, first, last)
                for first, last, _kind in hunks
            ]
            spans = [span for span in candidates if span]
            if spans:
                byte_ranges[rel_path] = merge(spans)
    return byte_ranges
# Hunk header, e.g. "@@ -12,5 +14,7 @@"; groups 1 and 3 are the old/new start lines.
_HUNK_HEADER_RE = re.compile(r'^@@\s+-(\d+)(?:,(\d+))?\s+\+(\d+)(?:,(\d+))?\s+@@')
# New-file header carrying a "b/"-prefixed path, e.g. "+++ b/pkg/mod.py".
_NEW_FILE_RE = re.compile(r'^\+\+\+\s+b/(.+)')
# Line prefixes that end the scan of a hunk body.
_HUNK_TERMINATORS = ('@@', 'diff ', '---')


def parse_diff_lines(diff_text: str, deletions_only: bool = False) -> Dict[str, List[Tuple[int, int]]]:
    """Extract edited line ranges per file from unified diff.

    Args:
        diff_text: Unified diff text.
        deletions_only: If True, only extract deleted lines (for EditLoc).
            If False, extract added/modified lines (for repair evaluation).

    Returns:
        {file: [(start_line, end_line)]} where lines are inclusive and the
        ranges of each file are sorted and merged. Files without any
        matching edit are not present in the result.
    """
    hunks = _parse_hunks(diff_text, deletions_only=deletions_only)
    result = {}
    for file_path, line_ranges in hunks.items():
        line_intervals = [(start, end) for start, end, _ in line_ranges]
        if line_intervals:
            result[file_path] = _merge_line_intervals(line_intervals)
    return result


def _merge_line_intervals(intervals: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Merge overlapping or adjacent line intervals.

    Intervals are inclusive; two intervals are merged when the second one
    starts no later than one line after the first one ends.
    """
    if not intervals:
        return []
    sorted_intervals = sorted(intervals)
    merged = [sorted_intervals[0]]
    for current in sorted_intervals[1:]:
        last = merged[-1]
        if current[0] <= last[1] + 1:
            merged[-1] = (last[0], max(last[1], current[1]))
        else:
            merged.append(current)
    return merged


def _parse_hunks(diff_text: str, deletions_only: bool = False) -> Dict[str, List[Tuple[int, int, str]]]:
    """Parse diff hunks. Returns {file: [(start_line, end_line, change_type)]}.

    Each tuple is one run of consecutive changed lines (inclusive bounds).
    Files are keyed by the path taken from their ``+++ b/<path>`` header.

    Args:
        diff_text: Unified diff text.
        deletions_only: If True, extract deleted lines (OLD file line
            numbers, change_type ``'del'``). If False, extract
            added/modified lines (NEW file line numbers, change_type
            ``'add'``).

    Known limitations, kept from the original heuristics: a hunk body line
    that starts with ``---`` ends the hunk scan, and an added line that
    starts with ``+++`` is not counted.
    """
    result = {}
    current_file = None
    lines = diff_text.split('\n')

    # Both modes share one scan; only the tracked marker, the start-line
    # group of the hunk header and the reported tag differ.
    marker = '-' if deletions_only else '+'
    header_prefix = marker * 3
    change_type = 'del' if deletions_only else 'add'
    start_group = 1 if deletions_only else 3

    for i, line in enumerate(lines):
        if line.startswith('+++'):
            match = _NEW_FILE_RE.match(line)
            if match:
                current_file = match.group(1)
            elif line.startswith('+++ ') and i > 0 and lines[i - 1].startswith('--- '):
                # A real file header without a "b/" path (e.g. "+++ /dev/null"
                # for a deleted file). Forget the previous file so the hunks
                # that follow are not attributed to it. The "--- " check keeps
                # hunk body lines such as "++++" from triggering the reset.
                current_file = None
        elif line.startswith('@@') and current_file:
            match = _HUNK_HEADER_RE.match(line)
            if not match:
                continue

            line_no = int(match.group(start_group))
            edit_start = None
            for j in range(i + 1, len(lines)):
                body_line = lines[j]
                if body_line.startswith(_HUNK_TERMINATORS):
                    break
                if body_line.startswith(marker) and not body_line.startswith(header_prefix):
                    # Tracked change: open a run if needed and advance.
                    if edit_start is None:
                        edit_start = line_no
                    line_no += 1
                elif body_line.startswith(' '):
                    # Context line: closes the current run and advances.
                    if edit_start is not None:
                        result.setdefault(current_file, []).append(
                            (edit_start, line_no - 1, change_type))
                        edit_start = None
                    line_no += 1
                # Anything else (the opposite marker, "\ No newline ...",
                # empty lines) does not advance the tracked line counter.

            # Run still open when the hunk body ends.
            if edit_start is not None:
                result.setdefault(current_file, []).append(
                    (edit_start, line_no - 1, change_type))
    return result