Search_Engine/importanttext.py
2022-05-06 17:19:37 -07:00

31 lines
779 B
Python

# You can ignore this file. This was for testing purposes
import json
import os
import shelve
from bs4 import BeautifulSoup
from time import perf_counter
import requests
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
# Resolve the fixture next to this script so the test does not depend on one
# machine's directory layout.
path_to_script = os.path.dirname(os.path.abspath(__file__))
my_filename = os.path.join(path_to_script, "testfile.json")

# NOTE(review): the response is never read below; the request is kept only so
# the script still performs the same single GET it always did.
url = "https://www.crummy.com/software/BeautifulSoup/bs4/doc/"
req = requests.get(url)

# The fixture is a JSON object whose "content" value holds the markup that is
# handed to BeautifulSoup. The context manager closes the handle once the JSON
# has been read; the encoding is pinned so decoding does not vary by platform.
with open(my_filename, encoding="utf-8") as file:
    content = json.load(file)
soup = BeautifulSoup(content["content"], 'lxml')

bold = []  # never populated in this script; printed unchanged at the end
#print(soup.prettify())

# Dump every <h3> element, then the tokenized text of every <title> element.
print(soup.find_all('h3'))
for i in soup.find_all('title'):
    print(word_tokenize(i.text))
print(bold)