view_essentialweb_cleaned / diff_utils.py
sumuks's picture
Create diff_utils.py
5c51375 verified
raw
history blame
2.69 kB
"""Diff visualization utilities."""
import difflib
import re
from typing import Iterator
def generate_html_diff(original: str, cleaned: str) -> str:
    """Render an HTML page comparing ``original`` with ``cleaned``.

    Both texts are split into lines (line endings kept) and handed to
    ``difflib.HtmlDiff.make_file`` in context mode with 3 lines of context,
    a tab size of 2 and wrapping at column 80. The columns are titled
    "Original Text" and "Cleaned Text". The generated page is then passed
    through ``_style_diff_html`` and that result is returned.
    """
    table_builder = difflib.HtmlDiff(tabsize=2, wrapcolumn=80)
    raw_page = table_builder.make_file(
        fromlines=original.splitlines(keepends=True),
        tolines=cleaned.splitlines(keepends=True),
        fromdesc="Original Text",
        todesc="Cleaned Text",
        context=True,
        numlines=3,
    )
    return _style_diff_html(raw_page)
def _style_diff_html(html_diff: str) -> str:
"""Add custom styling to the diff HTML."""
custom_style = """
<style>
.diff {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 12px;
line-height: 1.4;
border: 1px solid #ddd;
border-radius: 8px;
overflow: auto;
max-height: 600px;
}
.diff table {
width: 100%;
border-collapse: collapse;
margin: 0;
}
.diff td {
padding: 2px 8px;
vertical-align: top;
white-space: pre-wrap;
word-wrap: break-word;
}
.diff_header {
background: #f8f9fa;
font-weight: bold;
text-align: center;
border-bottom: 1px solid #dee2e6;
}
.diff_next {
background: #e9ecef;
text-align: center;
font-size: 10px;
}
.diff_add {
background: #d4edda;
color: #155724;
}
.diff_chg {
background: #fff3cd;
color: #856404;
}
.diff_sub {
background: #f8d7da;
color: #721c24;
}
</style>
"""
# Insert custom style before closing </head> tag
styled_html = html_diff.replace('</head>', f'{custom_style}</head>')
# Add diff class to the table
styled_html = styled_html.replace('<table class="diff"', '<div class="diff"><table class="diff"')
styled_html = styled_html.replace('</table>', '</table></div>')
return styled_html
def get_diff_stats(
    original: str,
    cleaned: str,
    *,
    autojunk: bool = True,
) -> dict[str, float]:
    """Get statistics about the diff between original and cleaned text.

    The comparison is done character by character with
    ``difflib.SequenceMatcher``.

    Args:
        original: Text before cleaning.
        cleaned: Text after cleaning.
        autojunk: Forwarded to ``SequenceMatcher``. The default (``True``)
            keeps difflib's automatic junk heuristic, which is fast but can
            badly underestimate similarity for inputs of 200+ characters.
            Pass ``False`` for an exact ratio at the cost of a slower
            comparison on long texts.

    Returns:
        A dict with:
            ``original_length``: ``len(original)`` (int).
            ``cleaned_length``: ``len(cleaned)`` (int).
            ``similarity_ratio``: ``SequenceMatcher.ratio()`` as a
                percentage rounded to 2 decimals (float).
            ``characters_removed``: ``len(original) - len(cleaned)`` (int);
                negative when ``cleaned`` is longer than ``original``.
    """
    matcher = difflib.SequenceMatcher(None, original, cleaned, autojunk=autojunk)
    original_length = len(original)
    cleaned_length = len(cleaned)
    return {
        'original_length': original_length,
        'cleaned_length': cleaned_length,
        'similarity_ratio': round(matcher.ratio() * 100, 2),
        'characters_removed': original_length - cleaned_length,
    }