File size: 2,687 Bytes
5c51375 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
"""Diff visualization utilities."""
import difflib
from typing import Iterator
def generate_html_diff(original: str, cleaned: str) -> str:
    """Generate a styled HTML side-by-side diff of two texts.

    Both texts are split into lines (line endings kept) and compared with
    ``difflib.HtmlDiff`` configured with a tab size of 2 and wrapping at
    column 80.  The diff is rendered in context mode with three lines of
    context around each change, then passed through ``_style_diff_html``.

    Args:
        original: Text shown in the "Original Text" column.
        cleaned: Text shown in the "Cleaned Text" column.

    Returns:
        The HTML document produced by ``HtmlDiff.make_file`` with the custom
        stylesheet injected and each diff table wrapped in a container div.
    """
    differ = difflib.HtmlDiff(tabsize=2, wrapcolumn=80)
    html_diff = differ.make_file(
        original.splitlines(keepends=True),
        cleaned.splitlines(keepends=True),
        "Original Text",
        "Cleaned Text",
        context=True,
        numlines=3,
    )
    return _style_diff_html(html_diff)


def _style_diff_html(html_diff: str) -> str:
    """Add custom styling to the diff HTML.

    Inserts a ``<style>`` element immediately before ``</head>`` and wraps
    every ``<table class="diff"`` element in a ``<div class="diff">``.

    Args:
        html_diff: HTML document as produced by ``HtmlDiff.make_file``.

    Returns:
        The restyled HTML document.
    """
    custom_style = """
<style>
.diff {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 12px;
line-height: 1.4;
border: 1px solid #ddd;
border-radius: 8px;
overflow: auto;
max-height: 600px;
}
.diff table {
width: 100%;
border-collapse: collapse;
margin: 0;
}
.diff td {
padding: 2px 8px;
vertical-align: top;
white-space: pre-wrap;
word-wrap: break-word;
}
.diff_header {
background: #f8f9fa;
font-weight: bold;
text-align: center;
border-bottom: 1px solid #dee2e6;
}
.diff_next {
background: #e9ecef;
text-align: center;
font-size: 10px;
}
.diff_add {
background: #d4edda;
color: #155724;
}
.diff_chg {
background: #fff3cd;
color: #856404;
}
.diff_sub {
background: #f8d7da;
color: #721c24;
}
</style>
"""
    # Insert custom style before closing </head> tag
    styled_html = html_diff.replace('</head>', f'{custom_style}</head>')
    # Wrap each diff table in a scrollable container div.
    return _wrap_diff_tables(styled_html)


def _wrap_diff_tables(html: str) -> str:
    """Wrap each ``<table class="diff"`` element in ``<div class="diff">``.

    The closing ``</div>`` is emitted only after the ``</table>`` that
    matches a wrapped opening tag.  A blanket replacement of every
    ``</table>`` would also add closers after tables that were never
    wrapped (for example tables nested inside a wrapped one), leaving the
    markup unbalanced.
    """
    open_tag = '<table'
    close_tag = '</table>'
    wrapped_open = '<table class="diff"'

    pieces = []
    # One flag per currently open table: True if that table got a wrapper.
    open_tables = []
    cursor = 0
    while True:
        next_open = html.find(open_tag, cursor)
        next_close = html.find(close_tag, cursor)
        if next_open == -1 and next_close == -1:
            break
        if next_close == -1 or (next_open != -1 and next_open < next_close):
            # An opening tag comes first: copy the text before it and
            # prepend the wrapper if this is a diff table.
            pieces.append(html[cursor:next_open])
            needs_wrapper = html.startswith(wrapped_open, next_open)
            if needs_wrapper:
                pieces.append('<div class="diff">')
            pieces.append(open_tag)
            open_tables.append(needs_wrapper)
            cursor = next_open + len(open_tag)
        else:
            # A closing tag comes first: copy through it and close the
            # wrapper only if the matching opening tag was wrapped.
            end = next_close + len(close_tag)
            pieces.append(html[cursor:end])
            if open_tables and open_tables.pop():
                pieces.append('</div>')
            cursor = end
    pieces.append(html[cursor:])
    return ''.join(pieces)
def get_diff_stats(original: str, cleaned: str) -> dict[str, float]:
    """Get statistics about the diff between original and cleaned text.

    The two strings are compared character by character with
    ``difflib.SequenceMatcher``.  The ``autojunk`` heuristic is disabled:
    when enabled it ignores frequently occurring characters as match
    anchors once the second sequence has 200 or more items, which can make
    the ratio of two nearly identical long texts collapse towards zero.
    Disabling it gives a faithful ratio at the cost of slower comparison on
    very large inputs.

    Args:
        original: Text before cleaning.
        cleaned: Text after cleaning.

    Returns:
        A dict with these keys:
            ``original_length``: number of characters in ``original``.
            ``cleaned_length``: number of characters in ``cleaned``.
            ``similarity_ratio``: similarity as a percentage (0-100),
                rounded to two decimal places.
            ``characters_removed``: ``len(original) - len(cleaned)``;
                negative when ``cleaned`` is the longer string.
    """
    matcher = difflib.SequenceMatcher(None, original, cleaned, autojunk=False)
    original_length = len(original)
    cleaned_length = len(cleaned)
    return {
        'original_length': original_length,
        'cleaned_length': cleaned_length,
        'similarity_ratio': round(matcher.ratio() * 100, 2),
        'characters_removed': original_length - cleaned_length,
    }