"""Pytest cases for ``get_top_k_words_from_html``."""

import re
from collections import Counter

from bs4 import BeautifulSoup

from output_13_RP import get_top_k_words_from_html


class Testget_top_k_words_from_html:
    """Checks the ``(word, count)`` lists returned for various inputs.

    Every test calls ``get_top_k_words_from_html(html, k)`` and compares the
    result against a hard-coded list of ``(word, count)`` tuples.  The
    expected lists are all lowercase and omit words such as "this", "is"
    and "a", so the tests rely on the function lowercasing and filtering
    those words.
    """

    def test_basic_html(self):
        """Plain sentence input, asking for the top three words."""
        html = "This is a simple HTML text. Hello world!"
        assert get_top_k_words_from_html(html, 3) == [
            ('simple', 1),
            ('html', 1),
            ('text', 1),
        ]

    def test_html_with_tags(self):
        """Multi-line input; fewer distinct words than the requested ``k``."""
        # NOTE(review): this fixture contains no markup even though the test
        # name mentions tags -- the tags look like they were stripped at some
        # point.  Restore the original markup if it is available.
        html = "\n\nTitle\n\nis a paragraph.\n\n"
        assert get_top_k_words_from_html(html, 4) == [
            ('title', 1),
            ('paragraph', 1),
        ]

    def test_html_with_special_characters(self):
        """Arabic-Indic digits are expected back as a single word."""
        html = "Encoding: ٠١٢"
        assert get_top_k_words_from_html(html, 3) == [
            ('encoding', 1),
            ('٠١٢', 1),
        ]

    def test_html_with_unicode_characters(self):
        """Accented Latin letters are expected back as a single word."""
        html = "Unicode: àéîőű"
        assert get_top_k_words_from_html(html, 4) == [
            ('unicode', 1),
            ('àéîőű', 1),
        ]

    def test_html_with_duplicate_words(self):
        """Words repeated with different capitalisation are counted together."""
        html = "Repeated words test. This is a test repeated words."
        assert get_top_k_words_from_html(html, 4) == [
            ('repeated', 2),
            ('words', 2),
            ('test', 2),
        ]

    def test_large_html(self):
        """One word repeated 1000 times is reported with a count of 1000."""
        # Test with a large HTML content.
        # NOTE(review): the empty-string prefix/suffix are no-ops as written;
        # presumably they once held wrapping tags -- confirm and restore.
        html = "" + "word1 " * 1000 + ""
        assert get_top_k_words_from_html(html, 1) == [('word1', 1000)]

    def test_html_with_numbers(self):
        """Digit-only tokens count as words; ``k`` truncates the result."""
        html = "Numbers: 123 456 789"
        assert get_top_k_words_from_html(html, 3) == [
            ('numbers', 1),
            ('123', 1),
            ('456', 1),
        ]

    def test_html_with_unicode_characters_and_numbers(self):
        """Mixed accented letters and digits; "and" is absent from the result."""
        html = "Unicode and numbers: àéîőű 12345"
        assert get_top_k_words_from_html(html, 4) == [
            ('unicode', 1),
            ('numbers', 1),
            ('àéîőű', 1),
            ('12345', 1),
        ]