ScrapeGraphAI / Scrapegraph-ai

Python scraper based on AI

Home Page: https://scrapegraphai.com

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

'SmartScraperGraph' object has no attribute 'model_token'

Naman-Bhrgv opened this issue · comments

Describe the bug
Hi,
I am trying to scrape a webpage using SmartScraperGraph, but I am constantly getting the following error:

'SmartScraperGraph' object has no attribute 'model_token'

To Reproduce

This is the code-

from scrapegraphai.graphs import SmartScraperGraph

# Hugging Face Hub model used for generation; 'MY_API_TOKEN' is a placeholder.
repo_id = "meta-llama/Llama-2-7b-hf"
llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.5,
    huggingfacehub_api_token='MY_API_TOKEN',
)

# Hosted inference-API embeddings backing the graph's retrieval step.
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key='MY_API_TOKEN',
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

# Pre-built model instances are handed straight to the graph configuration.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
}

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, event_end_date, event_end_time, location, event_mode, event_category, third_party_redirect, no_of_days, time_in_hours, hosted_or_attending, refreshments_type,  registration_available, registration_link",
    # also accepts a string with the already downloaded HTML code
    source="https://www.hmhco.com/event",
    config=graph_config,
)

result = smart_scraper_graph.run()
print(result)

Hi, please try it with repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

Thanks it worked for me!

I am getting this error also. This is my code:

import os
import json
from typing import List
from pydantic import BaseModel, Field
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()

# Define the output schema for the graph
# Schema leaf: one hyperlink scraped from the FAQ page.
# NOTE: `#` comments (not docstrings) on purpose — a class docstring would be
# picked up as the model description in the generated schema.
class FAQLink(BaseModel):
    text: str = Field(description="The text of the link")
    url: str = Field(description="The URL of the link")

# Schema node: a FAQ section header together with its links.
class FAQCategory(BaseModel):
    header: str = Field(description="The header of the FAQ category")
    links: List[FAQLink] = Field(description="The list of links in this category")

# Schema root passed to SmartScraperGraph via its `schema=` argument.
class FAQStructure(BaseModel):
    categories: List[FAQCategory] = Field(description="The list of FAQ categories")

# Build the Azure OpenAI chat and embedding clients from environment variables.
llm_model_instance = AzureChatOpenAI(
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
)

embedder_model_instance = AzureOpenAIEmbeddings(
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
)

# Hand the pre-built model instances to the graph configuration.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
}

# Build the scraper with the typed output schema and run it.
smart_scraper_graph = SmartScraperGraph(
    prompt="Extract all FAQ categories, their headers, and the links (text and URL) within each category from the CIMB bank FAQ page",
    source="https://www.cimb.com.my/en/personal/help-support/faq.html",
    schema=FAQStructure,
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

Same error for me, and I'm using "mistralai/Mistral-7B-Instruct-v0.2":

Here's my code:
from bs4 import BeautifulSoup
import requests
import json
import time

# Google News topic feeds (pt-PT locale); all but the first and last
# entries are commented out, presumably to limit the crawl — TODO confirm.
urls = [
"https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNRFZ5TkhjU0JYQjBMVkJVS0FBUAE?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
#"https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx1YlY4U0JYQjBMVkJVR2dKUVZDZ0FQAQ?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
#"https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JYQjBMVkJVR2dKUVZDZ0FQAQ?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
#"https://news.google.com/topics/CAAqLAgKIiZDQkFTRmdvSkwyMHZNR1ptZHpWbUVnVndkQzFRVkJvQ1VGUW9BQVAB?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
#"https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNREpxYW5RU0JYQjBMVkJVR2dKUVZDZ0FQAQ?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
#"https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JYQjBMVkJVR2dKUVZDZ0FQAQ?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
"https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNR3QwTlRFU0JYQjBMVkJVS0FBUAE?hl=pt-PT&gl=PT&ceid=PT%3Apt-150"
]

def fetch_and_parse(url):
    """Fetch *url* over HTTP and return a parsed BeautifulSoup tree.

    Returns None (after printing a message) when the response status
    is anything other than 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch HTML. Status code: {response.status_code}")
        return None
    # Throttle after each successful fetch, as the original did.
    time.sleep(5)
    return BeautifulSoup(response.content, 'html.parser')

full_urls = []
base_url = "https://news.google.com"

# Walk each topic feed, pull the article anchors out of the 'D9SJMe'
# container, and turn their relative hrefs into absolute URLs.
for url in urls:
    soup = fetch_and_parse(url)
    if not soup:
        continue
    target_tag = soup.find('c-wiz', class_='D9SJMe')
    if not target_tag:
        print("Target tag not found.")
        continue
    for article in target_tag.find_all('article'):
        article_link_tag = article.find('a', class_='JtKRv')
        if article_link_tag:
            # hrefs come back as './articles/...'; strip the leading dot.
            full_urls.append(base_url + article_link_tag['href'].strip('.'))

# Save all URLs to the file outside the loop
with open('urls.txt', 'w') as f:
    for url in full_urls:
        f.write(url + '\n')

from langchain_huggingface import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from scrapegraphai.graphs import SmartScraperGraph
from google.colab import userdata

# LLM served through the Hugging Face inference endpoint; the API token
# comes from Colab's secret store.
llm_model_instance = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    max_new_tokens=512,
    temperature=0.1,
    huggingfacehub_api_token=userdata.get("HUGGINGFACEHUB_API_TOKEN"),
)

# Embedding model used by the graph's retrieval step.
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=userdata.get("HUGGINGFACEHUB_API_TOKEN"),
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

# Pre-built model instances are handed straight to the graph configuration.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
}

def extract_content(url):
    """Scrape *url* with SmartScraperGraph and return the result as JSON text.

    Returns None (after printing the error) if scraping raises or if the
    graph does not produce a dict.
    """
    try:
        scraper = SmartScraperGraph(
            prompt="Find and extract the text content from the following url:",
            source=url,
            config=graph_config,
        )
        result = scraper.run()
        if isinstance(result, dict):
            return json.dumps(result, ensure_ascii=False)
        # Non-dict results are treated as failures; the raise below is
        # caught by the handler so the caller just sees None.
        raise ValueError("Invalid JSON format")
    except Exception as e:
        print(f"Error processing URL '{url}': {e}")
        return None

# Scrape every collected article URL, keeping only the successful results.
extracted_content = []
for source in full_urls:
    result = extract_content(source)
    if result is not None:
        extracted_content.append(result)

# Persist each JSON payload separated by a blank line.
with open("extracted_content.txt", "w", encoding='utf-8') as f:
    f.writelines(content + "\n\n" for content in extracted_content)

Any solutions to the problem? I'm using OpenAI Azure

Hi, could you send me an Azure key by email, just so I can run a quick trial?

I'm afraid I can't do that 😞
As the API is not a product of mine, it belongs to a company.

How can we test it? We are an open-source org.

I understand your point and I appreciate the great effort, truly.
Though I assumed testing examples were already available, since the Azure option is already offered (and therefore presumably fully tested), and the implementation code is also in the package's documentation 😊
Anyways, whenever I find time, I'll try to find the issue and hopefully debug it. Thanks!