# Forum Scraper - As an AI...

import requests
from bs4 import BeautifulSoup
import random
import re

def find_post_urls(url):
    """Return the href of every ``<a class="forum-title">`` link on a page.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of href strings in document order. Anchors that carry the
        ``forum-title`` class but have no ``href`` attribute are skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.3'
    }
    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    post_urls = []
    for tag in soup.find_all('a', class_='forum-title'):
        # .get() avoids a KeyError on anchors without an href attribute.
        post_url = tag.get('href')
        if post_url:
            post_urls.append(post_url)

    return post_urls

def find_all_links(url):
    """Collect every forum-topic link found on a page.

    Each matching link is also printed as it is found.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list, in document order, of every ``<a>`` href on the page that
        contains ``http://www.asanai.net/forum/topic``. Duplicates are kept.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.3'
    }
    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    topic_stem = 'http://www.asanai.net/forum/topic'
    links = []
    for tag in soup.find_all('a'):
        link = tag.get('href')
        # Skip anchors without an href and links that are not topic pages.
        if link and topic_stem in link:
            links.append(link)
            print(link)

    return links

def parse_conversation(url):
    """Fetch a thread page and flatten it into a list of strings.

    Args:
        url: Address of the thread page to fetch.

    Returns:
        A list whose first element is ``"THREAD TITLE "`` followed by the
        thread title (or ``"No title found"``). The remaining elements come
        from each matching ``<div>`` in document order: the text of every
        ``<a>`` inside it, then the text of every ``<p>`` inside it whose
        text is not already in the list.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.3'
    }
    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # The title is expected in an <h1> carrying exactly these classes.
    title_tag = soup.find('h1', class_='main-title main-title-topic')
    title = title_tag.text if title_tag else "No title found"
    conversation = ["THREAD TITLE " + title]

    # Only <div>s whose class list is exactly one of these are treated as
    # part of a post.
    comment_classes = (
        ['post-element'],
        ['post-message'],
        ['post-author-block-name'],
    )

    def is_comment(tag):
        return tag.name == 'div' and tag.get('class') in comment_classes

    for comment_tag in soup.find_all(is_comment):
        for username_tag in comment_tag.find_all('a'):
            conversation.append(username_tag.text)

        # The comment text is assumed to live in <p> tags. The same <p> can
        # be reached through more than one matching <div>, so text that has
        # already been recorded is skipped.
        for comment_text_tag in comment_tag.find_all('p'):
            comment_text = comment_text_tag.text
            if comment_text in conversation:
                continue
            conversation.append(comment_text)

    return conversation

def find_title(url):
    """Fetch a thread page and return its title.

    Args:
        url: Address of the thread page to fetch.

    Returns:
        The text of the page's ``<h1 class="main-title main-title-topic">``
        element, or ``None`` when the page has no such element.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/58.0.3029.110 Safari/537.3'
    }
    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # The original body stopped after parsing and implicitly returned None,
    # so callers never received a title. Look the title up and return it.
    title_tag = soup.find('h1', class_='main-title main-title-topic')
    if title_tag is None:
        return None
    return title_tag.text

def main():
    """Scrape the forum and print three demonstrations.

    1. The parsed conversation of one randomly chosen thread.
    2. The parsed conversation of a fixed, targeted thread, printed once as
       a list and once element by element.
    3. The parsed conversation of every thread whose title contains a
       keyword (case-insensitive).
    """
    forum_url = "http://www.asanai.net/forum/forum/first-forum/"
    subforum_urls = find_post_urls(forum_url)

    all_threads = []
    for subforum_url in subforum_urls:
        all_threads.extend(find_all_links(subforum_url))

    print("\n\nRANDOM THREAD")
    # random.choice raises IndexError on an empty sequence, so only pick a
    # thread when the scrape actually found some.
    if all_threads:
        random_thread = random.choice(all_threads)
        print(random_thread)
        print(parse_conversation(random_thread))
    else:
        print("No threads found")
    print("=================================")

    print("\n\nTARGETED THREAD")
    steering = "http://www.asanai.net/forum/topic/steering-ais-future-whats-ethics-got-to-do-with-it/"
    print(steering)
    convo = parse_conversation(steering)
    print(convo)
    for element in convo:
        print(element)
    print("=================================")

    keyword = "howdy"
    keyword = keyword.lower()

    print("\n\nTHREAD(S) FOUND BY KEYWORD IN TITLE (" + keyword + ")")
    for thread in all_threads:
        # Links containing "postid" are skipped.
        if "postid" in thread:
            continue
        title = find_title(thread)
        if title is None:
            continue

        if keyword in title.lower():
            print("\n\n" + thread)
            print(parse_conversation(thread))
            print("=================================")

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()