Spaces:
Sleeping
Sleeping
"""Diff visualization utilities.""" | |
import difflib | |
from typing import Iterator | |
def generate_html_diff(original: str, cleaned: str) -> str:
    """Render a side-by-side HTML comparison of two texts.

    Both inputs are split into lines (line endings kept) and handed to
    ``difflib.HtmlDiff``, which produces a complete HTML page. The page is
    then post-processed by ``_style_diff_html``.

    Args:
        original: The text before cleaning; shown in the left column.
        cleaned: The text after cleaning; shown in the right column.

    Returns:
        The styled HTML page as a string.
    """
    before, after = (
        text.splitlines(keepends=True) for text in (original, cleaned)
    )

    # Tabs expand to 2 columns and long lines are wrapped at 80 characters.
    renderer = difflib.HtmlDiff(tabsize=2, wrapcolumn=80)

    # context=True with numlines=3 asks difflib for a contextual diff with
    # three lines of context around each change.
    page = renderer.make_file(
        before,
        after,
        fromdesc="Original Text",
        todesc="Cleaned Text",
        context=True,
        numlines=3,
    )
    return _style_diff_html(page)
def _style_diff_html(html_diff: str) -> str:
    """Add custom styling to the diff HTML.

    Two transformations are applied:

    1. A ``<style>`` block is inserted immediately before every ``</head>``
       occurrence. If the input has no ``</head>``, no CSS is added.
    2. Every ``<table class="diff" ...>`` element is wrapped in a
       ``<div class="diff">`` container.

    The wrapper is closed only after the ``</table>`` that actually ends a
    wrapped table. Tables nested inside it (for example the plain tables in
    difflib's legend) do not receive a stray ``</div>``, so the resulting
    markup stays balanced.

    Args:
        html_diff: HTML produced by ``difflib.HtmlDiff`` (a full page or a
            bare table).

    Returns:
        The HTML with the custom CSS and wrapper ``<div>`` elements added.
    """
    import re  # local import keeps this helper self-contained

    custom_style = """
    <style>
    .diff {
        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
        font-size: 12px;
        line-height: 1.4;
        border: 1px solid #ddd;
        border-radius: 8px;
        overflow: auto;
        max-height: 600px;
    }
    .diff table {
        width: 100%;
        border-collapse: collapse;
        margin: 0;
    }
    .diff td {
        padding: 2px 8px;
        vertical-align: top;
        white-space: pre-wrap;
        word-wrap: break-word;
    }
    .diff_header {
        background: #f8f9fa;
        font-weight: bold;
        text-align: center;
        border-bottom: 1px solid #dee2e6;
    }
    .diff_next {
        background: #e9ecef;
        text-align: center;
        font-size: 10px;
    }
    .diff_add {
        background: #d4edda;
        color: #155724;
    }
    .diff_chg {
        background: #fff3cd;
        color: #856404;
    }
    .diff_sub {
        background: #f8d7da;
        color: #721c24;
    }
    </style>
    """

    # Insert custom style before closing </head> tag
    styled_html = html_diff.replace('</head>', f'{custom_style}</head>')

    # Wrap each diff table in a container div. The alternation order matters:
    # the specific diff-table opener must be tried before the generic one.
    diff_table_open = '<table class="diff"'
    table_tag = re.compile(r'<table class="diff"|<table\b|</table>')

    pieces = []
    # One entry per currently open <table>; True when that table was given
    # a wrapper div and therefore needs a matching </div> when it closes.
    open_tables = []
    cursor = 0
    for tag in table_tag.finditer(styled_html):
        if tag.group() == '</table>':
            pieces.append(styled_html[cursor:tag.end()])
            cursor = tag.end()
            # Tolerate a stray </table> with no opener: nothing to close.
            if open_tables and open_tables.pop():
                pieces.append('</div>')
        else:
            needs_wrapper = tag.group() == diff_table_open
            open_tables.append(needs_wrapper)
            if needs_wrapper:
                pieces.append(styled_html[cursor:tag.start()])
                pieces.append('<div class="diff">')
                cursor = tag.start()
    pieces.append(styled_html[cursor:])

    return ''.join(pieces)
def get_diff_stats(original: str, cleaned: str) -> dict[str, float]:
    """Get statistics about the diff between original and cleaned text.

    The comparison is character-based (the strings themselves are the
    sequences given to ``difflib.SequenceMatcher``).

    Args:
        original: The text before cleaning.
        cleaned: The text after cleaning.

    Returns:
        A dict with these keys:

        * ``original_length`` (int): number of characters in ``original``.
        * ``cleaned_length`` (int): number of characters in ``cleaned``.
        * ``similarity_ratio`` (float): ``SequenceMatcher.ratio()`` expressed
          as a percentage and rounded to two decimals (``100.0`` for
          identical inputs, including two empty strings).
        * ``characters_removed`` (int): ``len(original) - len(cleaned)``;
          negative when the cleaned text is longer than the original.
    """
    # NOTE: SequenceMatcher keeps its default ``autojunk=True`` heuristic,
    # which for sequences of 200+ items ignores very frequent items while
    # matching. On long texts the ratio is therefore an approximation.
    matcher = difflib.SequenceMatcher(None, original, cleaned)
    original_length = len(original)
    cleaned_length = len(cleaned)

    return {
        'original_length': original_length,
        'cleaned_length': cleaned_length,
        'similarity_ratio': round(matcher.ratio() * 100, 2),
        'characters_removed': original_length - cleaned_length,
    }