Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	| import random | |
| import requests | |
| import os, glob | |
# English literature: Project Gutenberg plain-text URLs that load_books()
# downloads when it is not reading from the local 'text' folder.
books = [
    'https://www.gutenberg.org/cache/epub/1513/pg1513.txt',
    'https://www.gutenberg.org/files/2701/2701-0.txt',
    'https://www.gutenberg.org/cache/epub/84/pg84.txt',
    'https://www.gutenberg.org/cache/epub/2641/pg2641.txt',
    'https://www.gutenberg.org/cache/epub/1342/pg1342.txt',
    'https://www.gutenberg.org/cache/epub/100/pg100.txt',
]
# Whitelist of characters kept by filter_data(); every other character is
# dropped from the loaded text.
#
# Default English set (kept for reference, currently disabled):
# allowed_chars = ' abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()-_+=\"\':;[]{}/<>,.`~\n\\'
#
# German set (active): the English set plus ä, ö, ß, ü, Ö and Ü.
# NOTE(review): '?' and '|' are in neither set, and uppercase 'Ä' is missing
# here although 'Ö' and 'Ü' are present, so these characters are filtered
# out - confirm that this is intended before changing the set.
allowed_chars = ' aäbcdefghijklmnoöpqrsßtuüvwxyzABCDEFGHIJKLMNOÖPQRSTUÜVWXYZ0123456789!@#$%^&*()-_+=\"\':;[]{}/<>,.`~\n\\'
def download_book(book, timeout=30):
    """Download one book and return its text.

    Args:
        book: URL of a UTF-8 encoded plain-text file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = requests.get(book, timeout=timeout)
    # Fail loudly rather than decoding an error page as if it were book text.
    response.raise_for_status()
    return response.content.decode('utf-8')
def filter_data(data, allowed=None):
    """Return *data* with every character outside the whitelist removed.

    Args:
        data: Text to filter.
        allowed: Iterable of characters to keep. When None (the default),
            the module-level ``allowed_chars`` whitelist is used.

    Returns:
        A new string containing only the whitelisted characters of *data*,
        in their original order.
    """
    print('Filtering data')
    # Build the lookup set once per call: set membership is O(1), whereas
    # testing against the whitelist string scans it for every character.
    keep = frozenset(allowed_chars if allowed is None else allowed)
    return ''.join([char for char in data if char in keep])
def load_books(fromfolder=False):
    """Load, filter and concatenate the book corpus.

    Args:
        fromfolder: When true, read every ``*.txt`` file in the ``text``
            directory (relative to the current working directory). When
            false, download each URL listed in ``books``.

    Returns:
        All filtered texts joined into one string, separated by a single
        space.
    """
    text_data = []
    if fromfolder:
        current_working_directory = os.getcwd()
        print(current_working_directory)
        path = 'text'
        for filename in glob.glob(os.path.join(path, '*.txt')):
            # Decode explicitly as UTF-8 (the same encoding download_book()
            # uses) so umlauts do not depend on the platform's default locale.
            with open(filename, 'r', encoding='utf-8') as f:
                print(f'Loading {filename}')
                text_data.append(filter_data(f.read()))
    else:
        print(f'Loading {len(books)} books into ram')
        for book in books:
            text_data.append(filter_data(download_book(book)))
    print('Loaded books')
    return ' '.join(text_data)
def random_split_chunk(data, size=14):
    """Return a random run of consecutive words from *data*.

    Words are the pieces obtained by splitting *data* on single spaces.

    Args:
        data: Source text.
        size: Number of words in the returned chunk.

    Returns:
        ``size`` consecutive words joined by single spaces. If *data* has
        fewer than ``size`` words, the whole text is returned.
    """
    words = data.split(' ')
    # Bound the start so the slice cannot run off the end of the text;
    # otherwise a start within the last size-1 words gives a short chunk.
    last_start = max(0, len(words) - max(size, 1))
    index = random.randrange(last_start + 1)
    return ' '.join(words[index:index + size])
