bublint / ue5-llama-lora

A proof-of-concept project that showcases the potential for using small, locally trainable LLMs to create next-generation documentation tools.

2 Basic Questions

batuhan3526 opened this issue · comments

Hello, I am very new to this field. If I understood correctly, the Python code pulled the text data from all the links listed in the txt file and stored it in another txt file, and you then trained on that stored txt. Is that right?

Theoretically, I would need to replace the names of the txt files with ones for my research area and then insert the links for my research area. Could I then train my own model on the txt file I obtain that way? I'm wondering if this is possible.

If all of this is possible, I will devote myself to collecting the links to all of the arXiv PDFs related to my research topic and converting them to txt in a clean way.
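
For example, I imagine the PDF-to-txt step looking roughly like this (a rough sketch using pypdf; the file names are just placeholders and I haven't tested it):

```python
from pypdf import PdfReader

# Rough sketch: extract the text of one downloaded arXiv PDF into a txt file.
# "paper.pdf" and "paper.txt" are placeholder names.
reader = PdfReader("paper.pdf")
text = "\n\n".join(page.extract_text() or "" for page in reader.pages)

with open("paper.txt", "w", encoding="utf-8") as f:
    f.write(text)
```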

commented

There's no URL fetcher code in the repo. Here is the one I made for the unity3d docs; just modify the URLs in the code:

```python
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_soup(url):
    try:
        response = requests.get(url, timeout=10)  # Added a timeout of 10 seconds
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            return soup
        else:
            print(f"Error {response.status_code}: Failed to fetch the URL {url}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error: Failed to fetch the URL {url} due to {str(e)}")
        return None


def extract_links(soup, base_url):
    links = set()
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            full_url = urljoin(base_url, href)
            links.add(full_url)
    return links

def save_url_to_file(url):
    with open('urls.txt', 'a') as f:
        f.write(f"{url}\n")

def main():
    start_url = 'https://docs.unity3d.com/2022.2/Documentation/Manual/index.html'  # Replace with the website you want to scrape
    visited_links = set()
    to_visit_links = {start_url}

    while to_visit_links:
        current_url = to_visit_links.pop()
        # Restrict the crawl to the docs URL prefix only - if you allow any http:// or https:// URL it will take forever!
        if current_url.startswith(('http://docs.unity3d.com/2022.2', 'https://docs.unity3d.com/2022.2')) and current_url not in visited_links:
            visited_links.add(current_url)
            print(f"Visiting: {current_url}")
            save_url_to_file(current_url)

            soup = get_soup(current_url)
            if soup:
                new_links = extract_links(soup, current_url)
                to_visit_links.update(new_links)


if __name__ == "__main__":
    main()
```
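
For example, to point the crawler at a different documentation site, the only lines you would change in main() are the start URL and the prefix used in the startswith() check (the URL below is just a placeholder):

```python
# Hypothetical adaptation - these are the only lines in main() you would touch.
start_url = 'https://example.com/docs/index.html'  # placeholder for your docs' index page
# ...and restrict the crawl to the same site in the startswith() check:
allowed_prefixes = ('http://example.com/docs', 'https://example.com/docs')
```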

Hi again. I used this code for the Binance documentation. After gathering the Binance documentation URLs I got a txt file named "binance_data_urls.txt"; this file contains the many URLs for the Binance docs. Then I created a txt file named "binance_data.txt". When I ran the code below, I got binance_data.log:

```python
import requests
from bs4 import BeautifulSoup
import logging
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import re

# Set up logging configuration
logging.basicConfig(filename='binance_data.log', level=logging.INFO, filemode='w') # log for binance 

# Function to fetch the HTML content of a given URL
def fetch_page_content(url):
    try:
        options = Options()
        options.add_argument('--headless')
        driver = webdriver.Firefox(options=options)
        driver.get(url)
        time.sleep(10)
        content = driver.page_source
        driver.quit()
        return content
    except Exception as e:
        logging.error(f"Error fetching {url}: {e}")

# Function to parse the HTML content and extract the page text
def parse_html(content, url):
    soup = BeautifulSoup(content, 'html.parser')
    # print(soup) # For debugging purposes only
    body_div = soup.find("div", {"id": "maincol"})
    if not body_div:
        logging.warning(f"body div not successfully located in {url}")
        return None  # avoid calling find_all on None when the div is missing
    # print(body_div)  # For debugging purposes only
    elements = body_div.find_all(['p', 'h1', 'h2', 'h3'])
    if not elements:
        logging.warning(f"No tags found on {url}")
        return
    # Extract the text from each paragraph and remove consecutive whitespace characters
    texts = [re.sub(r'\s+', ' ', e.get_text().strip()) for e in elements]
    return '\n\n'.join(texts) + '\n\n'

# Main function to loop through the list of page URLs, fetch and parse their content,
# extract the content, and write it to a file
def main():
    # URLs of the pages to include in the .txt file
    with open('binance_data_urls.txt', 'r') as f:
        page_urls = f.read().splitlines()

    # Initialize a string to store the content
    docs_text = ''

    # Loop through the list of page URLs
    for i, url in enumerate(page_urls):
        # Log the current page being processed
        logging.info(f"Processing page {i+1}: {url}")

        # Fetch the HTML content of the page
        content = fetch_page_content(url)

        # If the content could not be fetched, continue to the next page
        if not content:
            continue

        # Parse the HTML content and extract the maincol div
        paragraphs = parse_html(content, url)

        # If the page container could not be found, continue to the next page
        if not paragraphs:
            continue

        # Append the content to the string storing all content
        docs_text += paragraphs

        print(f'Successfully parsed page {i+1}')

    # Write the content to a file
    try:
        with open('binance_data.txt', 'w', encoding='utf-8') as f:  # was unreal_docs.txt - renamed for the Binance docs data, right?
            f.write(docs_text)
            logging.info(f"Successfully wrote {len(page_urls)} pages to file.")
    except OSError as e:
        logging.error(f"Error writing to file: {e}")
    
    print("Script execution completed successfully.")

if __name__ == '__main__':
    main()
```
commented

You have to put the Firefox driver (geckodriver) in the root folder (the same folder as binance_data_urls.txt); here is the link to download it: https://github.com/mozilla/geckodriver/releases. This is my modified version; I added comments to the parts you'll want to change.

  • It uses Pool so it parses faster
  • It saves the contents in batches

```python
import requests_cache
from bs4 import BeautifulSoup
import logging
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import re
from multiprocessing import Pool


logging.basicConfig(filename='unity_docs.log', level=logging.INFO, filemode='w')


requests_cache.install_cache('web_cache')


def fetch_page_content(url):
    try:
        options = Options()
        options.add_argument('--headless')
        driver = webdriver.Firefox(options=options)
        driver.get(url)
        time.sleep(10)
        content = driver.page_source
        driver.quit()
        return content
    except Exception as e:
        logging.error(f"Error fetching {url}: {e}")


def parse_html(content, url):
    soup = BeautifulSoup(content, 'html.parser')
    body_div = soup.find("div", {"id": "content-wrap"}) #change the div to what you need to parse
    if not body_div:
        logging.warning(f"body div not successfully located in {url}")
        return None
    #elements = body_div.contents #Keep this one if you just want everything inside the div above
    elements = body_div.find_all(lambda tag: tag.name and tag.parent == body_div and not tag.attrs.get("class") in [["scrollToFeedback"], ["suggest"], ["footer-wrapper"]]) #This one excludes the following classes
    
    if not elements:
        logging.warning(f"No tags found on {url}")
        return
    texts = [re.sub(r'\s+', ' ', e.get_text().strip()) for e in elements]
    return '\n\n'.join(texts) + '\n\n'

def process_url(url):
    content = fetch_page_content(url)
    if not content:
        logging.warning(f"No content fetched for {url}")
        return

    paragraphs = parse_html(content, url)
    if not paragraphs:
        logging.warning(f"No paragraphs found for {url}")
        return

    return paragraphs + f"////{url}////\n"


# Main function to loop through the list of page URLs, fetch and parse their content,
# extract the content, and write it to a file
def main():
    with open('urls.txt', 'r') as f:
        page_urls = f.read().splitlines()

    batch_size = 100 #change batch to whatever size you want

    for i in range(0, len(page_urls), batch_size):
        with Pool() as pool:
            results = pool.map(process_url, page_urls[i:i + batch_size])

        docs_text = ''.join(filter(None, results))

        try:
            with open(f'output_{i // batch_size + 1}.txt', 'w', encoding='utf-8') as f:
                f.write(docs_text)
                logging.info(f"Successfully wrote {len(results)} pages to file.")
        except OSError as e:
            logging.error(f"Error writing to file: {e}")

    print("Script execution completed successfully.")

if __name__ == '__main__':
    main()
```
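
If you then want the batches merged back into a single txt for training (like binance_data.txt above), a minimal sketch could look like this (it assumes the output_N.txt naming from the script; merged_docs.txt is a placeholder name):

```python
import glob

# Concatenate the batch files written by the script above into one text file.
def merge_batches(pattern='output_*.txt', out_path='merged_docs.txt'):
    # Sort numerically on the N in output_N.txt so batches stay in order.
    paths = sorted(glob.glob(pattern), key=lambda p: int(p.split('_')[1].split('.')[0]))
    with open(out_path, 'w', encoding='utf-8') as out:
        for path in paths:
            with open(path, 'r', encoding='utf-8') as f:
                out.write(f.read())

if __name__ == '__main__':
    merge_batches()
```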