0xkiichiro / scaper

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Twitter Scraper

This tool allows you to scrape your preferred YouTube videos & tweets.

Usage

To use it, download this repository to your local machine, create a virtual environment (venv), and install the requirements:

pip install -r requirements.txt

Then simply type one of the following commands into your terminal:

Youtube: python scrape_youtube.py <channel_name>

Twitter: python scrape_twitter.py <twitter_handle>

Here are example commands for scraping Joe Rogan's videos and tweets:

Youtube: python scrape_youtube.py joerogan

Twitter: python scrape_twitter.py joerogan

Currently, we support scraping the following attributes:

Youtube

  • Video name
  • Video views
  • Video link
  • Video creation date
  • Video scraping date

Twitter

  • owner name
  • owner handle
  • tweet context
  • number of comments
  • number of likes
  • number of retweets
  • number of impressions
  • tweet link
  • tweeted at

Code

The code for this tool uses the following libraries:

  • selenium
  • pandas
  • time
  • typer
import sqlite3
import time

import typer
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from parse import convert_k_m_to_numeric

# Typer application object; `scrape` below is attached to it via @app.command()
# and it is invoked from the __main__ guard at the bottom of the script.
app = typer.Typer()

# Columns of the scraped_tweets table, in the order values are bound on insert.
_TWEET_COLUMNS = (
    'context', 'nu_of_comments', 'nu_of_likes', 'nu_of_retweets',
    'tweet_impressions', 'owner_handle', 'owner_name', 'tweet_link',
    'tweeted_at', 'created_at', 'is_retweet', 'retweet_source_user',
    'quote_source_user', 'quote_content', 'is_quote', 'has_media',
    'media_link',
)

# Absolute/relative XPaths into the rendered page. They mirror the page layout
# at the time of writing and must be updated whenever that layout changes.
_OWNER_NAME_XPATH = '//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div/div/div/div/div/div[2]/div[1]/div/div[1]/div/div/span/span[1]'
_OWNER_HANDLE_XPATH = '//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div/div/div/div/div/div[2]/div[1]/div/div[2]/div/div/div/span'
_TWEET_LINK_XPATH = './/div/div/div[2]/div[2]/div[1]/div/div[1]/div/div/div[2]/div/div[3]/a'
_RETWEET_SOURCE_XPATH = './/div/div/div[2]/div[2]/div[1]/div/div[1]/div/div/div[2]/div/div[1]/a/div/span'
_RETWEET_LINK_XPATH = './/div/div/div[2]/div[2]/div[1]/div[1]/div[1]/div/div/div[2]/div/div[3]/a'
_IMPRESSIONS_XPATH = './/div/div/div[2]/div[2]/div[4]/div/div[4]/a/div/div[2]/span/span/span'


def _dismiss_notifications_modal(driver):
    """Close the notifications dialog if one is currently shown (best effort)."""
    try:
        modal = driver.find_element(By.CSS_SELECTOR, '[data-testid="sheetDialog"]')
        modal.find_element(By.CSS_SELECTOR, '[role="button"]').click()
    except WebDriverException:
        # No dialog on screen, or it could not be clicked: nothing to do.
        pass


def _extract_tweet(tweet):
    """Read the per-tweet column values from one tweet element.

    Every value starts from a fresh default, so a failed lookup can never leak
    data from a previously processed tweet.

    Args:
        tweet: the Selenium element matched by '[data-testid="tweet"]'.

    Returns:
        A ``(record, complete)`` tuple. ``record`` maps the tweet-specific
        column names to their values; ``complete`` is False when the text,
        timestamp or reply/like/retweet counters could not be read.
    """
    # find_elements returns a (possibly empty) list, so its truthiness is the
    # actual "contains a video" signal.
    try:
        has_video = bool(tweet.find_elements(By.TAG_NAME, 'video'))
    except WebDriverException:
        has_video = False

    try:
        image = tweet.find_element(By.XPATH, './/img[@alt="Image"]')
        media_link = str(image.get_attribute('src'))
        has_image = True
    except WebDriverException:
        media_link = ''
        has_image = False

    try:
        tweet_link = tweet.find_element(By.XPATH, _TWEET_LINK_XPATH).get_attribute('href')
    except WebDriverException:
        # A tweet with a different layout must not abort the whole run.
        tweet_link = ''

    is_retweet = False
    retweet_source_user = ''
    try:
        is_retweet = bool(tweet.find_element(By.CSS_SELECTOR, 'span[data-testid="socialContext"]').text)
        if is_retweet:
            retweet_source_user = tweet.find_element(By.XPATH, _RETWEET_SOURCE_XPATH).text
            tweet_link = tweet.find_element(By.XPATH, _RETWEET_LINK_XPATH).get_attribute('href')
    except WebDriverException:
        is_retweet = False

    context = ''
    tweeted_at = ''
    nu_of_comments = nu_of_likes = nu_of_retweets = 0
    complete = True
    try:
        context = tweet.find_element(By.CSS_SELECTOR, 'div[data-testid="tweetText"]').text
        tweeted_at = tweet.find_element(By.TAG_NAME, 'time').text
        nu_of_comments = convert_k_m_to_numeric(tweet.find_element(By.CSS_SELECTOR, 'div[data-testid="reply"]').text)
        nu_of_likes = convert_k_m_to_numeric(tweet.find_element(By.CSS_SELECTOR, 'div[data-testid="like"]').text)
        nu_of_retweets = convert_k_m_to_numeric(tweet.find_element(By.CSS_SELECTOR, 'div[data-testid="retweet"]').text)
    except Exception:
        # Broad on purpose: convert_k_m_to_numeric's failure modes are not
        # visible from here. Unlike a bare except, Ctrl-C still propagates.
        complete = False

    try:
        tweet_impressions = convert_k_m_to_numeric(tweet.find_element(By.XPATH, _IMPRESSIONS_XPATH).text)
    except Exception:
        tweet_impressions = 0

    record = {
        'context': context,
        'nu_of_comments': nu_of_comments,
        'nu_of_likes': nu_of_likes,
        'nu_of_retweets': nu_of_retweets,
        'tweet_impressions': tweet_impressions,
        'tweet_link': tweet_link,
        'tweeted_at': tweeted_at,
        'is_retweet': is_retweet,
        'retweet_source_user': retweet_source_user,
        # Quote tweets are not parsed yet; the columns are stored empty.
        'quote_source_user': '',
        'quote_content': '',
        'is_quote': '',
        'has_media': has_video or has_image,
        'media_link': media_link,
    }
    return record, complete


def _store_visible_tweets(driver, conn, insert_sql, seen_contexts):
    """Insert every tweet currently in the DOM whose text has not been seen yet.

    Args:
        driver: the Selenium driver showing the profile page.
        conn: open sqlite3 connection; the scraped_tweets table is assumed to
            exist already (it is not created here).
        insert_sql: parameterised INSERT statement matching _TWEET_COLUMNS.
        seen_contexts: set of tweet texts already stored; updated in place.

    Returns:
        The number of newly stored tweets whose text/metrics were incomplete.
    """
    current_time = time.localtime()
    formatted_time = f"{current_time.tm_year}.{current_time.tm_mon}.{current_time.tm_mday} {current_time.tm_hour}:{current_time.tm_min}"

    owner_name = driver.find_element(By.XPATH, _OWNER_NAME_XPATH).text
    owner_handle = driver.find_element(By.XPATH, _OWNER_HANDLE_XPATH).text

    incomplete = 0
    for tweet in driver.find_elements(By.CSS_SELECTOR, '[data-testid="tweet"]'):
        record, complete = _extract_tweet(tweet)
        if record['context'] in seen_contexts:
            continue
        if not complete:
            incomplete += 1
        record['owner_name'] = owner_name
        record['owner_handle'] = owner_handle
        record['created_at'] = formatted_time
        conn.execute(insert_sql, [record[column] for column in _TWEET_COLUMNS])
        conn.commit()
        seen_contexts.add(record['context'])
    return incomplete


@app.command()
def scrape(twitter_handle: str):
    """Scrape the tweets on a Twitter profile into the local sqlite database.

    Opens https://twitter.com/<twitter_handle> in Chrome, repeatedly stores the
    tweets currently rendered and pages down, and stops once the document
    height no longer grows after a scroll. Rows are inserted into the
    ``scraped_tweets`` table of ``tweets.sqlite``; tweets are de-duplicated by
    their text within a single run.

    Args:
        twitter_handle: profile handle, without the leading '@'.
    """
    SCROLL_PAUSE_TIME = 4
    URL = f'https://twitter.com/{twitter_handle}'
    insert_sql = (
        f"INSERT INTO scraped_tweets({', '.join(_TWEET_COLUMNS)}) "
        f"VALUES({', '.join('?' * len(_TWEET_COLUMNS))})"
    )
    seen_contexts = set()
    pass_counter = 0  # accumulated over the whole run, not per scroll pass

    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        driver.get(URL)
        print('waiting..')
        time.sleep(SCROLL_PAUSE_TIME)

        conn = sqlite3.connect('tweets.sqlite')
        try:
            while True:
                _dismiss_notifications_modal(driver)

                last_height = driver.execute_script("return document.documentElement.scrollHeight")
                pass_counter += _store_visible_tweets(driver, conn, insert_sql, seen_contexts)

                # Scroll down three pages to trigger loading of older tweets.
                body = driver.find_element(By.TAG_NAME, 'body')
                for _ in range(3):
                    body.send_keys(Keys.PAGE_DOWN)

                # Wait for the page to load the next batch.
                print('still waiting..')
                time.sleep(SCROLL_PAUSE_TIME)
                new_height = driver.execute_script("return document.documentElement.scrollHeight")
                print('scroll down performed')

                # NOTE(review): an unchanged height is treated as "end of
                # timeline"; a slow load could end the run early - confirm.
                if new_height == last_height:
                    print('reached to the end!')
                    break
                print('keep going!')
        finally:
            # Release the database handle even when scraping fails midway.
            conn.close()
    finally:
        driver.quit()

    print(f'scrape completed! {len(seen_contexts)} tweets are scraped, number of passed tweets are {pass_counter}.')

# Run the Typer app only when this file is executed directly as a script.
if __name__ == '__main__':
    app()
    ```

## Conclusion

Happy scraping!

You can reach me for any questions & feature requests on:

GitHub: https://github.com/0xkiichiro
Twitter: https://twitter.com/0xkiichiro

## License

This tool is released under the MIT License.

About


Languages

Language:Python 98.2%Language:Dockerfile 1.8%