summarizer_preprocessing / custom_functions.py
cdgranadillo's picture
Create custom_functions.py
3e39b83
raw
history blame
689 Bytes
import re
def preprocessing_text(text: str) -> str:
    """Flatten a buyer/mediator/seller chat transcript into "Speaker:message" text.

    The transcript is cut at every speaker header, i.e. every match of
    ``Comprador``/``Mediador``/``Vendedor`` followed by a parenthesised
    group. Each message is its header plus everything up to the next header;
    anything before the first header is discarded.

    Messages whose first line contains ``"para Vendedor"`` are dropped. In
    the remaining messages, every stretch of a line that ends in ``.jpeg``,
    ``.jpg``, ``.png`` or ``.pdf`` is deleted. Each message is then emitted
    as the text before its first space, a colon, and everything after its
    second newline. The emitted pieces are concatenated with no separator.

    Args:
        text: Raw transcript.

    Returns:
        The concatenated ``"Speaker:message"`` pieces, or ``""`` when the
        transcript contains no speaker header.

    Raises:
        IndexError: If a kept message contains fewer than two newlines, so
            there is no text after the second line to emit.
    """
    # The same pattern drives both findall and split, so define it once.
    speaker_header = (
        r"Comprador*.?\(.*?\)"
        r"|Mediador*.?\(.*?\)"
        r"|Vendedor*.?\(.*?\)"
    )
    attachment_name = r".*?\.jpeg|.*?\.jpg|.*?\.png|.*?\.pdf"

    headers = re.findall(speaker_header, text)
    # Element 0 of the split is whatever precedes the first header; it does
    # not belong to any message, so skip it to keep headers/bodies aligned.
    bodies = re.split(speaker_header, text)[1:]
    conversation = [header + body for header, body in zip(headers, bodies)]

    # Remove text sent to seller. Build a new list instead of calling
    # remove() while iterating: in-place removal shifts the remaining items
    # and makes the iteration skip the message right after each removed one.
    conversation = [
        message
        for message in conversation
        if "para Vendedor" not in message.split("\n", 2)[0]
    ]

    pieces = []
    for message in conversation:
        # Strip attachment file names before extracting speaker and content.
        message = re.sub(attachment_name, "", message)
        speaker = message.split(" ")[0]
        # Parts 0 and 1 are the header line and the line after it; the
        # actual message content is everything after the second newline.
        content = message.split("\n", 2)[2]
        pieces.append(speaker + ":" + content)
    return "".join(pieces)