ScrapeGraphAI / Scrapegraph-ai

Python scraper based on AI

Home Page: https://scrapegraphai.com

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

'SmartScraperGraph' object has no attribute 'model_token'

Naman-Bhrgv opened this issue · comments

Describe the bug
Hi,
I am trying to scrape a webpage using SmartScraperGraph, but I am constantly getting the following error:

'SmartScraperGraph' object has no attribute 'model_token'

To Reproduce

This is the code-

from scrapegraphai.graphs import SmartScraperGraph

# Hugging Face Hub model used for generation; 'MY_API_TOKEN' is a placeholder.
repo_id = "meta-llama/Llama-2-7b-hf"
llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.5,
    huggingfacehub_api_token='MY_API_TOKEN',
)

# Hosted inference-API embeddings backing the graph's retrieval step.
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key='MY_API_TOKEN',
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

# Pre-built model instances are handed straight to the graph configuration.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
}

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, event_end_date, event_end_time, location, event_mode, event_category, third_party_redirect, no_of_days, time_in_hours, hosted_or_attending, refreshments_type,  registration_available, registration_link",
    # also accepts a string with the already downloaded HTML code
    source="https://www.hmhco.com/event",
    config=graph_config,
)

result = smart_scraper_graph.run()
print(result)

Hi, please try it with repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

Thanks it worked for me!

I am getting this error also. This is my code:

import os
import json
from typing import List
from pydantic import BaseModel, Field
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()

# Define the output schema for the graph
# Schema leaf: one hyperlink scraped from the FAQ page.
# NOTE: `#` comments (not docstrings) on purpose — a class docstring would be
# picked up as the model description in the generated schema.
class FAQLink(BaseModel):
    text: str = Field(description="The text of the link")
    url: str = Field(description="The URL of the link")

# Schema node: a FAQ section header together with its links.
class FAQCategory(BaseModel):
    header: str = Field(description="The header of the FAQ category")
    links: List[FAQLink] = Field(description="The list of links in this category")

# Schema root passed to SmartScraperGraph via its `schema=` argument.
class FAQStructure(BaseModel):
    categories: List[FAQCategory] = Field(description="The list of FAQ categories")

# Build the Azure OpenAI chat and embedding clients from environment variables.
llm_model_instance = AzureChatOpenAI(
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
)

embedder_model_instance = AzureOpenAIEmbeddings(
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"],
)

# Hand the pre-built model instances to the graph configuration.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
}

# Build the scraper with the typed output schema and run it.
smart_scraper_graph = SmartScraperGraph(
    prompt="Extract all FAQ categories, their headers, and the links (text and URL) within each category from the CIMB bank FAQ page",
    source="https://www.cimb.com.my/en/personal/help-support/faq.html",
    schema=FAQStructure,
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

Same error for me, and I'm using "mistralai/Mistral-7B-Instruct-v0.2":

Here's my code:
from bs4 import BeautifulSoup
import requests
import json
import time

# Google News topic feeds (pt-PT locale); all but the first and last
# entries are commented out, presumably to limit the crawl — TODO confirm.
urls = [
"https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNRFZ5TkhjU0JYQjBMVkJVS0FBUAE?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
#"https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx1YlY4U0JYQjBMVkJVR2dKUVZDZ0FQAQ?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
#"https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JYQjBMVkJVR2dKUVZDZ0FQAQ?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
#"https://news.google.com/topics/CAAqLAgKIiZDQkFTRmdvSkwyMHZNR1ptZHpWbUVnVndkQzFRVkJvQ1VGUW9BQVAB?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
#"https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNREpxYW5RU0JYQjBMVkJVR2dKUVZDZ0FQAQ?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
#"https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRFp1ZEdvU0JYQjBMVkJVR2dKUVZDZ0FQAQ?hl=pt-PT&gl=PT&ceid=PT%3Apt-150",
"https://news.google.com/topics/CAAqJQgKIh9DQkFTRVFvSUwyMHZNR3QwTlRFU0JYQjBMVkJVS0FBUAE?hl=pt-PT&gl=PT&ceid=PT%3Apt-150"
]

def fetch_and_parse(url):
    """Fetch *url* over HTTP and return a parsed BeautifulSoup tree.

    Returns None (after printing a message) when the response status
    is anything other than 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch HTML. Status code: {response.status_code}")
        return None
    # Throttle after each successful fetch, as the original did.
    time.sleep(5)
    return BeautifulSoup(response.content, 'html.parser')

full_urls = []
base_url = "https://news.google.com"

# Walk each topic feed, pull the article anchors out of the 'D9SJMe'
# container, and turn their relative hrefs into absolute URLs.
for url in urls:
    soup = fetch_and_parse(url)
    if not soup:
        continue
    target_tag = soup.find('c-wiz', class_='D9SJMe')
    if not target_tag:
        print("Target tag not found.")
        continue
    for article in target_tag.find_all('article'):
        article_link_tag = article.find('a', class_='JtKRv')
        if article_link_tag:
            # hrefs come back as './articles/...'; strip the leading dot.
            full_urls.append(base_url + article_link_tag['href'].strip('.'))

# Save all URLs to the file outside the loop
with open('urls.txt', 'w') as f:
    for url in full_urls:
        f.write(url + '\n')

from langchain_huggingface import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from scrapegraphai.graphs import SmartScraperGraph
from google.colab import userdata

# LLM served through the Hugging Face inference endpoint; the API token
# comes from Colab's secret store.
llm_model_instance = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    max_new_tokens=512,
    temperature=0.1,
    huggingfacehub_api_token=userdata.get("HUGGINGFACEHUB_API_TOKEN"),
)

# Embedding model used by the graph's retrieval step.
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=userdata.get("HUGGINGFACEHUB_API_TOKEN"),
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

# Pre-built model instances are handed straight to the graph configuration.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
}

def extract_content(url):
    """Scrape *url* with SmartScraperGraph and return the result as JSON text.

    Returns None (after printing the error) if scraping raises or if the
    graph does not produce a dict.
    """
    try:
        scraper = SmartScraperGraph(
            prompt="Find and extract the text content from the following url:",
            source=url,
            config=graph_config,
        )
        result = scraper.run()
        if isinstance(result, dict):
            return json.dumps(result, ensure_ascii=False)
        # Non-dict results are treated as failures; the raise below is
        # caught by the handler so the caller just sees None.
        raise ValueError("Invalid JSON format")
    except Exception as e:
        print(f"Error processing URL '{url}': {e}")
        return None

# Scrape every collected article URL, keeping only the successful results.
extracted_content = []
for source in full_urls:
    result = extract_content(source)
    if result is not None:
        extracted_content.append(result)

# Persist each JSON payload separated by a blank line.
with open("extracted_content.txt", "w", encoding='utf-8') as f:
    f.writelines(content + "\n\n" for content in extracted_content)

Any solutions to the problem? I'm using OpenAI Azure

Hi, could you send me an Azure key by email, just so I can run a quick trial?

I'm afraid I can't do that 😞
As the API is not a product of mine, it belongs to a company.

How can we test it? We are an open-source org.

I understand your point and I appreciate the great effort, truly.
Though I assumed testing examples were already available, since the Azure option is already offered (and therefore presumably fully tested), and the implementation code is also in the package's documentation 😊
Anyways, whenever I find time, I'll try to find the issue and hopefully debug it. Thanks!