import streamlit as st
import subprocess
import os

from jsonToText import convert_json_to_text
from llm import process_and_save_json

# Default XPath if the user does not provide one
DEFAULT_XPATH = '//body'  # Change this to whatever default XPath you prefer
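# For example, a more targeted default such as '//article' or
# '//div[@class="content"]' narrows the scrape to the main page content.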

# Function to update the spider with user inputs and run it
def run_spider(website_url, xpath):
    # Extract the domain from the website URL
    domain = website_url.split("//")[-1].split("/")[0]

    # Spider file whose start_urls, custom_xpath, and allowed_domains
    # will be rewritten with the user's input
    spider_path = 'webscraper/webscraper/spiders/websiteSpider.py'

    # Read the spider file
    with open(spider_path, 'r') as file:
        spider_code = file.readlines()

    # Modify start_urls, custom_xpath, and allowed_domains in place
    for idx, line in enumerate(spider_code):
        if line.strip().startswith('start_urls ='):
            spider_code[idx] = f'    start_urls = ["{website_url}"]\n'
        if line.strip().startswith('custom_xpath ='):
            spider_code[idx] = f'    custom_xpath = "{xpath}"\n'
        if line.strip().startswith('allowed_domains ='):
            spider_code[idx] = f'    allowed_domains = ["{domain}"]\n'

    # Write back the modified spider code
    with open(spider_path, 'w') as file:
        file.writelines(spider_code)

    # Run the Scrapy spider using subprocess
    scrapy_command = 'scrapy crawl websiteSpider'
    spider_dir = 'webscraper/webscraper'  # Directory of the Scrapy project
    subprocess.run(scrapy_command, cwd=spider_dir, shell=True)
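
# For reference, run_spider assumes websiteSpider.py defines class attributes
# it can rewrite by line prefix. A minimal sketch of such a spider
# (hypothetical; the real file in this repo may differ) would look like:
#
#     import scrapy
#
#     class WebsiteSpider(scrapy.Spider):
#         name = "websiteSpider"
#         allowed_domains = ["example.com"]
#         start_urls = ["https://example.com"]
#         custom_xpath = "//body"
#
#         def parse(self, response):
#             # Yield the text of every node matched by the user's XPath
#             for node in response.xpath(self.custom_xpath):
#                 yield {"text": " ".join(node.xpath(".//text()").getall())}
#
# Since the crawl command above passes no -o flag, feed export to scraped.json
# is assumed to be configured in the project's settings.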

# Streamlit UI
st.title('Web Scraper Interface')

# User input for the website URL and XPath
website_url = st.text_input('Enter the website URL:', '')
xpath = st.text_input('Enter the XPath:', DEFAULT_XPATH)  # Falls back to the default XPath

# Flag to track whether the spider has been run
spider_ran = False

# Button to run the spider
if st.button('Run Spider'):
    if website_url:
        st.write(f'Running the spider on {website_url} using XPath: {xpath}')
        run_spider(website_url, xpath)
        st.success('Spider finished running!')
        convert_json_to_text("webscraper/webscraper/scraped.json", "output.txt")
        spider_ran = True
    else:
        st.error('Please provide a website URL.')
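
# Note: Streamlit reruns this script from the top on every interaction, so the
# local spider_ran flag resets to False on each rerun. The os.path.exists
# check below is what keeps the download buttons visible on later reruns.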

# If the spider has been run (or output from a previous run exists), show download buttons
if spider_ran or os.path.exists("webscraper/webscraper/scraped.json"):
    # Offer the converted text output, if the conversion produced it
    if os.path.exists("output.txt"):
        with open("output.txt", "r") as file:
            st.download_button(
                label="Download Output Text",
                data=file,
                file_name="output.txt",
                mime="text/plain"
            )

    # Offer the raw scraped.json file
    with open("webscraper/webscraper/scraped.json", "r") as json_file:
        st.download_button(
            label="Download Scraped JSON",
            data=json_file,
            file_name="scraped.json",
            mime="application/json"
        )

# Title for the organizing section
st.title("Do you want to organize the scraped data?")

# Use session state to track whether the user has clicked "Yes"
if "organize_requested" not in st.session_state:
    st.session_state.organize_requested = False

# Button to reveal the organize section
if st.button("Yes"):
    st.session_state.organize_requested = True
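
# Unlike the local spider_ran flag, st.session_state persists across reruns,
# so the organize section stays open after the "Yes" click triggers a rerun.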

# If the user clicked "Yes", show inputs for 'about', the details, and the API key
if st.session_state.organize_requested:
    # One or two words describing the data, used to guide the organizing step
    about = st.text_input('Enter one or two words that describe the data, like "books" or "events":', '')

    # Custom details to extract from the data (e.g. name, price, stock)
    details = st.text_input('Enter the details to extract (comma-separated), like name, date:', '')

    # Groq API key, masked in the UI
    api_key = st.text_input('Enter your Groq API key:', type="password")

    # Button to organize and save the JSON
    if st.button("Organize"):
        if about and details and api_key:
            # Convert the comma-separated details into a list
            details_list = [detail.strip() for detail in details.split(',')]

            # Process and save the JSON with the provided details and API key
            process_and_save_json("output.txt", "organize.json", api_key=api_key, about=about, details=details_list)
            st.success('Data has been organized and saved to organize.json.')

            # Offer the organized JSON file for download
            with open("organize.json", "r") as organized_json_file:
                st.download_button(
                    label="Download Organized JSON",
                    data=organized_json_file,
                    file_name="organize.json",
                    mime="application/json"
                )
        else:
            st.error('Please provide a description, details, and your API key before organizing.')