
scrapper is meh.

tags/v0.4.0
roxie committed 6 years ago · commit b0dc5a81f6
3 changed files with 29 additions and 20 deletions
1. +9  -3   libs/scrapper/scrapper.py
2. +7  -11  libs/scrapper/scrappersites/gfy.py
3. +13 -6   libs/scrapper/scrappersites/imgur.py

+9 -3   libs/scrapper/scrapper.py

  import requests
+ import random
  from libs.scrapper.scrappersites import imgur, reddit, gfy, tumblr


  class scrapper():
      def __init__(self):
          pass

-     def linkget(self, subreddit):
-         html = requests.get("https://reddit.com/r/"+subreddit+".json", headers = {'User-agent': 'RoxBot Discord Bot'})
+     def linkget(self, subreddit, israndom):
+         if israndom:
+             options = [".json?count=100", "/top/.json?sort=top&t=all&count=100"]
+             choice = random.choice(options)
+             subreddit += choice
+         html = requests.get("https://reddit.com/r/"+subreddit, headers = {'User-agent': 'RoxBot Discord Bot'})
          reddit = html.json()["data"]["children"]
          return reddit


if "imgur" in url: if "imgur" in url:
url2 = imgur.imgur().get(url) url2 = imgur.imgur().get(url)
elif "gfycat" in url: elif "gfycat" in url:
url2 = gfy.gfycat().get(str(url))
url2 = gfy.gfycat().get(url)
elif "eroshare" in url: elif "eroshare" in url:
#eroshare.eroshare().get(url) #eroshare.eroshare().get(url)
pass pass
url2 = reddit.reddit().get(url) url2 = reddit.reddit().get(url)
elif "media.tumblr" in url: elif "media.tumblr" in url:
url2 = tumblr.tumblr().get(url) url2 = tumblr.tumblr().get(url)
print(url)
return url2 return url2
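Note on the new linkget: when israndom is set it randomly picks between the subreddit's default (hot) listing and its all-time top listing. When israndom is false, though, the rewritten code appends no .json suffix at all, so html.json() would be parsing an HTML page. A minimal standalone sketch of the intended flow (the function name and default are mine, and it appends a .json suffix either way so the parse cannot fail):

    import random
    import requests

    def fetch_posts(subreddit, israndom=True):
        # Optionally swap the default (hot) listing for the all-time top one.
        suffix = ".json?count=100"
        if israndom:
            suffix = random.choice([".json?count=100",
                                    "/top/.json?sort=top&t=all&count=100"])
        resp = requests.get("https://reddit.com/r/" + subreddit + suffix,
                            headers={"User-agent": "RoxBot Discord Bot"})
        # Reddit nests a listing's posts under data -> children.
        return resp.json()["data"]["children"]

    # e.g. fetch_posts("pics")[0]["data"]["url"] -> link of the first post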

+7 -11  libs/scrapper/scrappersites/gfy.py

      def __init__(self):
          pass

-     def url_get(self,url,urladd):
+     def url_get(self,url):
          urlsplit = url.split("/")
-         urlsplit[2] = urladd + urlsplit[2]
-         urlsplit.append(".webm")
-         i = 0
-         urlnew = ""
-         for split in urlsplit:
-             urlnew = urlnew + split
-             i += 1
-             if i <= 3:
-                 urlnew = urlnew + "/"
+         urlsplit[2] = "giant." + urlsplit[2]
+         urlsplit[-1] += ".gif"
+         urlnew = "/".join(urlsplit)
          return urlnew

      def get(self,url):
-         return url
+         #url2 = self.url_get(url)
+         url2 = url
+         return url2
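The rewritten url_get turns a gfycat page link into a direct giant.gfycat.com file link by prefixing the host and appending an extension, though this commit leaves it unused: get() passes the URL straight through, with the url_get call commented out. A sketch of what the helper produces (the slug is hypothetical):

    def gfy_direct_url(url):
        # "https://gfycat.com/SomeSlug" -> "https://giant.gfycat.com/SomeSlug.gif"
        parts = url.split("/")
        parts[2] = "giant." + parts[2]  # netloc gains the "giant." media prefix
        parts[-1] += ".gif"             # the slug gains a file extension
        return "/".join(parts)

    assert gfy_direct_url("https://gfycat.com/ExampleSlug") == \
           "https://giant.gfycat.com/ExampleSlug.gif"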

+13 -6  libs/scrapper/scrappersites/imgur.py

      def removed(self,url):
          page = requests.get(url)
          soup = BeautifulSoup(page.content, 'html.parser')
-         if "removed.png" in soup.a["src"]:
+         if "removed.png" in soup.img["src"]:
              return True
          else:
              return False

      def get(self, url):
-         if self.removed(url):
-             return False
          if url.split(".")[-1] in ("png", "jpg", "jpeg", "gif", "gifv"):
              return url
+         elif url.split("/")[-2] == "a":
+         #elif url.split(".")[-1] == "gifv":
+         #    urlsplit = url.split(".")
+         #    urlsplit[-1] = "gif"
+         #    url = ".".join(urlsplit)
+         #    return url"""
-         else:
+             if self.removed(url):
+                 return False
              page = requests.get(url)
              soup = BeautifulSoup(page.content, 'html.parser')
              links = []
              for img in soup.find_all("img"):
                  if not img["src"] in links:
                      links.append(img["src"])
              if len(links) > 1:
-                 return False
+                 return url
              else:
+                 print(links)
+                 if not "http" in links[0]:
+                     links[0] = "https:" + links[0]
                  return links[0]
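Taken together, the reworked get() returns direct image links untouched, handles /a/ album links by bailing out when imgur's removed.png placeholder is present and scraping img tags otherwise, hands back the album URL itself when several distinct images turn up, and patches protocol-relative src values with an https: prefix. A condensed sketch of that flow, assuming requests and BeautifulSoup as in the file (the helper name and the empty-list guard are mine):

    import requests
    from bs4 import BeautifulSoup

    IMAGE_EXTS = ("png", "jpg", "jpeg", "gif", "gifv")

    def imgur_link(url):
        # Direct file links need no scraping.
        if url.split(".")[-1] in IMAGE_EXTS:
            return url
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")
        # imgur swaps deleted images for a removed.png placeholder.
        if soup.img is not None and "removed.png" in soup.img.get("src", ""):
            return False
        links = []
        for img in soup.find_all("img"):
            src = img.get("src")
            if src and src not in links:
                links.append(src)
        if not links:           # nothing scraped (guard added for safety)
            return False
        if len(links) > 1:      # multi-image album: hand back the album URL
            return url
        link = links[0]
        if "http" not in link:  # fix protocol-relative //i.imgur.com/... sources
            link = "https:" + link
        return link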
