# Reconstructed from git patch 1b3b031e4b15f947c539ae76fc892874de03c4be
# ("initial commit", Matt Lyon, Wed, 30 Oct 2019 15:27:45 +1100), whose line
# breaks had been collapsed.  The patch adds three files under tpblite/models:
# an empty __init__.py, torrents.py and utils.py.  The two non-empty modules
# follow, each under its own banner.

# ============================================================
# tpblite/models/torrents.py
# ============================================================
import re

# Multipliers for the size units accepted by fileSizeStrToInt.
_UNIT_FACTORS = {
    'B': 1,
    'KiB': 2**10,
    'MiB': 2**20,
    'GiB': 2**30,
    'TiB': 2**40,
}

# Matches the href of a magnet link; compiled once instead of once per row.
_MAGNET_HREF = re.compile('magnet')


def fileSizeStrToInt(size_str):
    """
    Convert a file size string such as '1.5 GiB' to a number of bytes.

    The number and the unit may be separated by any whitespace (including a
    non-breaking space) or by nothing at all.  Supported units are B, KiB,
    MiB, GiB and TiB.

    Raises AttributeError when the string cannot be interpreted.
    """
    try:
        text = size_str.strip()
        unit = text[-3:]
        if unit in _UNIT_FACTORS:
            number = text[:-3]
        else:
            # Plain byte counts ("512 B") carry a one-letter unit.
            unit = text[-1:]
            number = text[:-1]
        return int(float(number) * _UNIT_FACTORS[unit])
    except (AttributeError, KeyError, OverflowError, TypeError, ValueError) as e:
        raise AttributeError(
            'Cannot determine filesize: {0}, error: {1}'.format(size_str, e)
        ) from e


class Torrent(object):
    """
    Details of a single search result: title, magnet link, file size,
    number of seeds, number of leeches etc.

    Everything is extracted from ``html_row`` when the object is created.
    The row only needs to offer ``find`` and ``find_all`` the way a
    BeautifulSoup tag does.
    """

    def __init__(self, html_row):
        self.html_row = html_row
        self.title = self._getTitle()
        self.seeds, self.leeches = self._getPeers()
        (self.uploaded, self.filesize,
         self.byte_size, self.uploader) = self._getFileInfo()
        # _getFileInfo has already converted the size; reuse that value
        # rather than parsing the same string a second time.
        self.filesize_int = self.byte_size
        self.magnetlink = self._getMagnetLink()

    def __str__(self):
        return '{0}, S: {1}, L: {2}, {3}'.format(self.title,
                                                 self.seeds,
                                                 self.leeches,
                                                 self.filesize)

    def _getTitle(self):
        """Return the text of the row's 'detLink' anchor."""
        return self.html_row.find('a', class_='detLink').string

    def _getPeers(self):
        """Return (seeds, leeches) read from the two right-aligned cells."""
        taglist = self.html_row.find_all('td', align='right')
        return int(taglist[0].string), int(taglist[1].string)

    def _getFileInfo(self):
        """
        Split the 'detDesc' text into its comma separated parts.

        Returns a tuple (upload time, size string, size in bytes, uploader).
        """
        text = self.html_row.find('font', class_='detDesc').get_text()
        parts = text.split(',')
        uptime = parts[0].replace('Uploaded ', '')
        size = parts[1].replace('Size ', '')
        byte_size = fileSizeStrToInt(size)
        uploader = parts[2].replace('ULed by ', '').strip()
        return uptime, size, byte_size, uploader

    def _getMagnetLink(self):
        """Return the href of the first anchor whose href contains 'magnet'."""
        tag = self.html_row.find('a', href=_MAGNET_HREF)
        return tag.get('href')


class Torrents(object):
    """
    Collection of Torrent objects parsed from the html of a query response.
    Can be iterated over and has methods to select items from the list.
    """

    def __init__(self, webpage):
        self.webpage = webpage
        self.list = self._createTorrentList()

    def __str__(self):
        return 'Torrents Object: {0} torrents'.format(len(self.list))

    def __iter__(self):
        return iter(self.list)

    def _createTorrentList(self):
        """Parse the stored page and build one Torrent per result row."""
        # Imported here so the size helpers and the filtering logic stay
        # usable when BeautifulSoup is not installed.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(self.webpage, features='html.parser')
        return [Torrent(row) for row in self.__getRows(soup)]

    def __getRows(self, soup):
        """
        Return the table rows that hold results.

        The first and the last row are dropped (presumably the table header
        and the pagination row -- confirm against a live page).  A page with
        no body or with fewer than three rows yields an empty list.
        """
        if soup.body is None:
            return []
        return soup.body.find_all('tr')[1:-1]

    def getBestTorrent(self, min_seeds=30, min_filesize='1 GiB', max_filesize='4 GiB'):
        """
        Return the torrent with the most seeds that satisfies the limits.

        min_filesize and max_filesize may be given as size strings
        ('1 GiB') or directly as a number of bytes.  When several torrents
        share the highest seed count the earliest one in the list wins.

        Returns None when no torrent satisfies the limits.
        """
        if isinstance(min_filesize, str):
            min_filesize = fileSizeStrToInt(min_filesize)
        if isinstance(max_filesize, str):
            max_filesize = fileSizeStrToInt(max_filesize)
        candidates = [
            torrent for torrent in self.list
            if self._filterTorrent(torrent, min_seeds, min_filesize, max_filesize)
        ]
        return max(candidates, key=lambda torrent: torrent.seeds, default=None)

    def _filterTorrent(self, torrent, min_seeds, min_filesize, max_filesize):
        """Return True when the torrent lies within all of the given limits."""
        return not (torrent.seeds < min_seeds
                    or torrent.filesize_int < min_filesize
                    or torrent.filesize_int > max_filesize)


# ============================================================
# tpblite/models/utils.py
# ============================================================
import random
from urllib.request import Request, urlopen

# Delete these when finished rewriting URL
# (purl is a further such dependency; it is imported inside Segments.__init__)
from collections import OrderedDict
# ==============================


class Query(object):
    """
    Query object capable of getting html response given
    a search query and other parameters.

    The request is sent as soon as the object is created; the raw response
    body is stored in ``webpage``.
    """

    def __init__(self, query, base_url='https://tpb.party', page=0, order=99, category=0):
        self.base_url = base_url
        self.base_path = '/search'
        self.url = URL(base_url, self.base_path,
                       segments=['query', 'page', 'order', 'category'],
                       defaults=[query, str(page), str(order), str(category)],
                       )
        self.webpage = self._sendRequest()

    def _sendRequest(self):
        """Fetch ``self.url`` and return the response body as bytes."""
        req = Request(str(self.url), headers=headers())
        # The context manager closes the connection once the body is read.
        with urlopen(req) as response:
            return response.read()


### REWRITE THEN DELETE THESE

def URL(base, path, segments=None, defaults=None):
    """
    URL segment handler capable of getting and setting segments by name. The
    URL is constructed by joining base, path and segments.

    For each segment a property capable of getting and setting that segment is
    created dynamically.
    """
    # The properties are attached to a throwaway subclass, so every call
    # gets its own set and Segments itself is never modified.
    url_class = type(Segments.__name__, (Segments,),
                     {'__doc__': Segments.__doc__})
    segments = [] if segments is None else segments
    defaults = [] if defaults is None else defaults
    # For each segment attach a property capable of getting and setting it
    for segment in segments:
        setattr(url_class, segment, url_class._segment(segment))
    # Instantiate the class with the actual parameters
    return url_class(base, path, segments, defaults)


class Segments(object):

    """
    URL segment handler, not intended for direct use. The URL is constructed by
    joining base, path and segments.
    """

    def __init__(self, base, path, segments, defaults):
        # Imported here because purl is only needed until the URL handling
        # is rewritten (see the note above the imports).
        from purl import URL as PURL

        # Preserve the base URL
        self.base = PURL(base, path=path)
        # Map the segments and defaults lists to an ordered dict; zip stops
        # at the shorter list, so surplus names or defaults are ignored.
        self.segments = OrderedDict(zip(segments, defaults))

    def build(self):
        """Return a new URL object: the base path followed by the segment values."""
        # Join base segments and segments
        segments = self.base.path_segments() + tuple(self.segments.values())
        # Create a new URL with the segments replaced
        return self.base.path_segments(segments)

    def __str__(self):
        return self.build().as_string()

    def _get_segment(self, segment):
        return self.segments[segment]

    def _set_segment(self, segment, value):
        self.segments[segment] = value

    @classmethod
    def _segment(cls, segment):
        """
        Returns a property capable of setting and getting a segment.
        """
        return property(
            fget=lambda x: cls._get_segment(x, segment),
            fset=lambda x, v: cls._set_segment(x, segment, v),
        )


USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/60.0.3112.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/60.0.3112.113 Safari/537.36',
)


def headers():
    """
    The Pirate Bay blocks requests (403 Forbidden)
    basing on User-Agent header, so it's probably better to rotate them.
    User-Agents taken from:
    https://techblog.willshouse.com/2012/01/03/most-common-user-agents/

    Returns a fresh dict with a randomly chosen User-Agent on every call.
    """
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "origin_req_host": "thepiratebay.se",
    }

### ====================