Browse Source

scrapper is meh.

tags/v0.4.0
roxie 6 years ago
parent
commit
b0dc5a81f6
3 changed files with 29 additions and 20 deletions
  1. +9
    -3
      libs/scrapper/scrapper.py
  2. +7
    -11
      libs/scrapper/scrappersites/gfy.py
  3. +13
    -6
      libs/scrapper/scrappersites/imgur.py

+ 9
- 3
libs/scrapper/scrapper.py View File

@@ -1,12 +1,17 @@
import requests
import random
from libs.scrapper.scrappersites import imgur, reddit, gfy, tumblr

class scrapper():
def __init__(self):
pass

def linkget(self, subreddit, israndom=False):
    """Fetch a listing of posts from a subreddit via Reddit's JSON API.

    :param subreddit: subreddit name, e.g. "pics".
    :param israndom: when True, request one of two larger listings
        (hot or all-time top) chosen at random to vary results.
        Defaults to False so pre-existing one-argument callers still work.
    :return: list of post wrappers (the listing's ``data -> children``).
    """
    if israndom:
        options = [".json?count=100", "/top/.json?sort=top&t=all&count=100"]
        subreddit += random.choice(options)
    else:
        # Bug fix: without this suffix the non-random path fetched the
        # HTML page instead of the JSON listing, so .json() would fail.
        subreddit += ".json"
    html = requests.get("https://reddit.com/r/" + subreddit,
                        headers={'User-agent': 'RoxBot Discord Bot'})
    # Local name `posts` avoids shadowing the imported `reddit` module.
    posts = html.json()["data"]["children"]
    return posts

@@ -15,7 +20,7 @@ class scrapper():
if "imgur" in url:
url2 = imgur.imgur().get(url)
elif "gfycat" in url:
url2 = gfy.gfycat().get(str(url))
url2 = gfy.gfycat().get(url)
elif "eroshare" in url:
#eroshare.eroshare().get(url)
pass
@@ -23,4 +28,5 @@ class scrapper():
url2 = reddit.reddit().get(url)
elif "media.tumblr" in url:
url2 = tumblr.tumblr().get(url)
print(url)
return url2

+ 7
- 11
libs/scrapper/scrappersites/gfy.py View File

@@ -2,18 +2,14 @@ class gfycat():
def __init__(self):
pass

def url_get(self, url):
    """Rewrite a gfycat page URL into a direct giant.gfycat.com .gif link.

    :param url: gfycat page URL, e.g. ``https://gfycat.com/SomeName``.
    :return: direct media URL, e.g.
        ``https://giant.gfycat.com/SomeName.gif``.
    """
    parts = url.split("/")
    # ["https:", "", "gfycat.com", "SomeName"] -> host gets the
    # "giant." media-server prefix, last segment gets the extension.
    parts[2] = "giant." + parts[2]
    parts[-1] += ".gif"
    return "/".join(parts)

def get(self, url):
    """Return the gfycat URL unchanged.

    The ``url_get`` rewrite is intentionally disabled for now
    (gfycat links embed fine as-is), so this is a pass-through.
    """
    # url2 = self.url_get(url)  # disabled rewrite, kept for reference
    return url

+ 13
- 6
libs/scrapper/scrappersites/imgur.py View File

@@ -9,18 +9,22 @@ class imgur():
def removed(self, url):
    """Return True if the imgur page at *url* shows the 'removed' placeholder.

    Fetches the page and inspects the first ``<img>`` tag's ``src``.

    :param url: imgur page URL to check.
    :return: True when the removal placeholder image is present,
        False otherwise (including pages with no image at all).
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    img = soup.img
    # Robustness fix: soup.img is None when the page has no <img>,
    # which previously raised a TypeError on subscripting.
    if img is None or not img.get("src"):
        return False
    return "removed.png" in img["src"]

def get(self, url):
if self.removed(url):
return False
if url.split(".")[-1] in ("png", "jpg", "jpeg", "gif", "gifv"):
return url
elif url.split("/")[-2] == "a":
#elif url.split(".")[-1] == "gifv":
# urlsplit = url.split(".")
# urlsplit[-1] = "gif"
# url = ".".join(urlsplit)
# return url"""
else:
if self.removed(url):
return False
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
links = []
@@ -29,6 +33,9 @@ class imgur():
if not img["src"] in links:
links.append(img["src"])
if len(links) > 1:
return False
return url
else:
print(links)
if not "http" in links[0]:
links[0] = "https:" + links[0]
return links[0]

Loading…
Cancel
Save