From 4f00535cc83f57aac4b2a420907497e15f7c2f35 Mon Sep 17 00:00:00 2001
From: JPFrancoia
Date: Tue, 24 Dec 2019 17:46:08 +0100
Subject: Adding a browse() method to the TPB object, to get torrents by
 category, without any query. Also ran Black + cleaned some docstrings +
 added some type annotations.

---
 tpblite/models/torrents.py | 135 ++++++++++++++++++++++-----------------------
 tpblite/models/utils.py    |  73 +++++++++++++++---------
 tpblite/tpblite.py         |  70 ++++++++++++++++-------
 3 files changed, 165 insertions(+), 113 deletions(-)

diff --git a/tpblite/models/torrents.py b/tpblite/models/torrents.py
index 2d8bc6a..ada282f 100644
--- a/tpblite/models/torrents.py
+++ b/tpblite/models/torrents.py
@@ -2,133 +2,132 @@ import re
 import unicodedata
 from bs4 import BeautifulSoup
 
-#TODO: write better comments
+# TODO: write better comments
+
 
 def fileSizeStrToInt(size_str):
-    '''Converts file size given in *iB format to bytes integer'''
-
-    unit_dict = {'KiB':(2**10),
-                 'MiB':(2**20),
-                 'GiB':(2**30),
-                 'TiB':(2**40)}
+    """Converts file size given in *iB format to bytes integer"""
+
+    unit_dict = {"KiB": (2 ** 10), "MiB": (2 ** 20), "GiB": (2 ** 30), "TiB": (2 ** 40)}
     try:
         num = float(size_str[:-3])
         unit = size_str[-3:]
         return int(num * unit_dict[unit])
     except Exception as e:
-        raise AttributeError('Cannot determine filesize: {0}, error: {1}'.format(size_str,e))
-
-class Torrent(object):
-    '''
+        raise AttributeError(
+            "Cannot determine filesize: {0}, error: {1}".format(size_str, e)
+        )
+
+
+class Torrent:
+    """
     Abstract class to contain info about torrent
     magnet link, file size, number of seeds,
     number of leeches etc.
-    '''
+    """
+
     def __init__(self, html_row):
         self.html_row = html_row
         self.title = self._getTitle()
         self.seeds, self.leeches = self._getPeers()
-        self.upload_date, self.filesize, self.byte_size, self.uploader = self._getFileInfo()
+        self.upload_date, self.filesize, self.byte_size, self.uploader = (
+            self._getFileInfo()
+        )
         self.magnetlink = self._getMagnetLink()
-
+
     def __str__(self):
-        return '{0}, S: {1}, L: {2}, {3}'.format(self.title,
-                                                 self.seeds,
-                                                 self.leeches,
-                                                 self.filesize)
-
+        return "{0}, S: {1}, L: {2}, {3}".format(
+            self.title, self.seeds, self.leeches, self.filesize
+        )
+
     def __repr__(self):
-        return ''.format(self.title)
+        return "".format(self.title)
 
     def _getTitle(self):
-        return self.html_row.find('a', class_='detLink').string
+        return self.html_row.find("a", class_="detLink").string
 
     def _getMagnetLink(self):
-        tag = self.html_row.find('a', href=(re.compile('magnet')))
-        link = tag.get('href')
+        tag = self.html_row.find("a", href=(re.compile("magnet")))
+        link = tag.get("href")
         return link
-
+
     def _getPeers(self):
-        taglist = self.html_row.find_all('td', align='right')
+        taglist = self.html_row.find_all("td", align="right")
         return int(taglist[0].string), int(taglist[1].string)
-
+
     def _getFileInfo(self):
-        text = self.html_row.find('font', class_='detDesc').get_text()
-        t = text.split(',')
-        uptime = unicodedata.normalize('NFKD', t[0].replace('Uploaded ','').strip())
-        size = unicodedata.normalize('NFKD', t[1].replace('Size ', '').strip())
+        text = self.html_row.find("font", class_="detDesc").get_text()
+        t = text.split(",")
+        uptime = unicodedata.normalize("NFKD", t[0].replace("Uploaded ", "").strip())
+        size = unicodedata.normalize("NFKD", t[1].replace("Size ", "").strip())
         byte_size = fileSizeStrToInt(size)
-        uploader = unicodedata.normalize('NFKD', t[2].replace('ULed by ', '').strip())
+        uploader = unicodedata.normalize("NFKD", t[2].replace("ULed by ", "").strip())
         return uptime, size, byte_size, uploader
-
-
-class Torrents(object):
-    '''
+
+
+class Torrents:
+    """
     Torrent object, takes query response and parses into
    torrent list or dict. Has methods to select items from
    torrent list.
-    '''
-    def __init__(self, search_str, html_source):
-        self.search_str = search_str
-        self.__search_set = None
-
+    """
+
+    def __init__(self, html_source):
         self.html_source = html_source
         self.list = self._createTorrentList()
-
+
     def __str__(self):
-        return 'Torrents object: {} torrents'.format(len(self.list))
-
+        return "Torrents object: {} torrents".format(len(self.list))
+
     def __repr__(self):
-        return ''.format(len(self.list))
-
+        return "".format(len(self.list))
+
     def __iter__(self):
         return iter(self.list)
 
     def __len__(self):
         return len(self.list)
 
-    def __getitem__(self,index):
+    def __getitem__(self, index):
         return self.list[index]
 
-    @property
-    def _search_set(self):
-        if self.__search_set is None:
-            self.__search_set = set(filter(None, re.split(r'[\s.|\(|\)]',self.search_str.lower())))
-        return self.__search_set
-
     def _createTorrentList(self):
-        soup = BeautifulSoup(self.html_source, features='html.parser')
+        soup = BeautifulSoup(self.html_source, features="html.parser")
         if soup.body is None:
-            raise ConnectionError('Could not determine torrents (empty html body)')
-        rows = soup.body.find_all('tr')
+            raise ConnectionError("Could not determine torrents (empty html body)")
+        rows = soup.body.find_all("tr")
         torrents = []
         for row in rows:
-            # Get the lowercase unique set from the row text
-            text_set = set(filter(None, re.split(r'[\s.|\(|\)]',row.text.lower())))
-            # Check if search string is subset
-            if self._search_set.issubset(text_set):
+            if len(row.find_all("td", {"class": "vertTh"})) == 1:
                 torrents.append(Torrent(row))
         return torrents
-
-    def getBestTorrent(self, min_seeds=30, min_filesize='1 GiB', max_filesize='4 GiB'):
-        '''Filters torrent list based on some constraints, then returns highest seeded torrent
+
+    def getBestTorrent(self, min_seeds=30, min_filesize="1 GiB", max_filesize="4 GiB"):
+        """Filters torrent list based on some constraints, then returns highest seeded torrent
 
         :param min_seeds (int): minimum seed number filter
         :param min_filesize (str): minimum filesize in XiB form, eg. GiB
         :param max_filesize (str): maximum filesize in XiB form, eg. GiB
-        :return Torrent Object: Torrent with highest seed number, will return None if all are filtered out'''
+        :return Torrent Object: Torrent with highest seed number, will return None if all are filtered out"""
         if not isinstance(min_filesize, int):
             min_filesize = fileSizeStrToInt(min_filesize)
         if not isinstance(max_filesize, int):
            max_filesize = fileSizeStrToInt(max_filesize)
-        filtered_list = filter(lambda x: self._filterTorrent(x, min_seeds, min_filesize, max_filesize), self.list)
+        filtered_list = filter(
+            lambda x: self._filterTorrent(x, min_seeds, min_filesize, max_filesize),
+            self.list,
+        )
         sorted_list = sorted(filtered_list, key=lambda x: x.seeds, reverse=True)
         if len(sorted_list) > 0:
             return sorted_list[0]
         else:
-            print('No torrents found given criteria')
+            print("No torrents found given criteria")
             return None
-
+
     def _filterTorrent(self, torrent, min_seeds, min_filesize, max_filesize):
-        if (torrent.seeds < min_seeds) or (torrent.byte_size < min_filesize) or (torrent.byte_size > max_filesize):
+        if (
+            (torrent.seeds < min_seeds)
+            or (torrent.byte_size < min_filesize)
+            or (torrent.byte_size > max_filesize)
+        ):
             return False
         else:
-            return True
\ No newline at end of file
+            return True
diff --git a/tpblite/models/utils.py b/tpblite/models/utils.py
index 1d6b351..eb24f8c 100644
--- a/tpblite/models/utils.py
+++ b/tpblite/models/utils.py
@@ -1,40 +1,63 @@
+from typing import Tuple, Type
 import random
 from urllib.request import Request, urlopen
 import urllib.error
 from purl import URL as pURL
 
-class QueryParser(object):
-    '''
-    Query object capable of getting html response given
-    a search query and other parameters.
-    '''
-    def __init__(self, query, base_url, page, order, category):
+
+class QueryParser:
+    """Query object capable of getting html response given a search query and other
+    parameters.
+    """
+
+    # PirateBay URL to use for queries
+    base_url: str
+
+    # Compiled search string used to query the PirateBay URL
+    url: str
+
+    def __init__(self, base_url: str, segments: Tuple[str, ...]):
         self.base_url = base_url
-        segments = ('search', query, str(page), str(order), str(category))
         self.url = URL(base_url, segments)
         try:
             self.html_source = self._sendRequest()
         except urllib.error.URLError:
-            raise ConnectionError('Could not establish connection wtih {}'.format(self.base_url))
-
+            raise ConnectionError(
+                "Could not establish connection wtih {}".format(self.base_url)
+            )
+
+    @classmethod
+    def from_search(
+        cls, query: str, base_url: str, page: int, order: int, category: int
+    ):
+        segments = ("search", query, str(page), str(order), str(category))
+        return cls(base_url, segments)
+
+    @classmethod
+    def from_browse(cls, base_url: str, category: int, page: int, order: int):
+        print("browsing")
+        segments = ("browse", str(category), str(page), str(order), "0")
+
+        return cls(base_url, segments)
+
     def _sendRequest(self):
         req = Request(self.url, headers=headers())
         return urlopen(req).read()
 
-def URL(base, segments):
+
+def URL(base: str, segments: Tuple[str, ...]) -> str:
     u = pURL().from_string(base)
     url = u.path_segments(segments)
     return url.as_string()
 
 
 def headers():
-    '''
+    """
     The Pirate Bay blocks requests (403 Forbidden)
     basing on User-Agent header, so it's probably
    better to rotate them.
     User-Agents taken from:
     https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
-    '''
+    """
     return {
         "User-Agent": random.choice(USER_AGENTS),
         "origin_req_host": "thepiratebay.se",
@@ -42,16 +65,16 @@ def headers():
 
 
 USER_AGENTS = (
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/60.0.3112.113 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/60.0.3112.101 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/60.0.3112.113 Safari/537.36',
-    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/60.0.3112.113 Safari/537.36',
-)
\ No newline at end of file
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/60.0.3112.113 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/60.0.3112.101 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/60.0.3112.113 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/60.0.3112.113 Safari/537.36",
+)
diff --git a/tpblite/tpblite.py b/tpblite/tpblite.py
index a2a927d..9d153ac 100644
--- a/tpblite/tpblite.py
+++ b/tpblite/tpblite.py
@@ -1,34 +1,64 @@
+from typing import Optional
+
 from .models.torrents import Torrents, Torrent
 from .models.utils import QueryParser
 
-class TPB(object):
-
-    def __init__(self, base_url='https://tpb.party'):
-        '''ThePirateBay Object
+
+class TPB:
+
+    # PirateBay URL to use for queries
+    base_url: str
+
+    # Compiled search string used to query the PirateBay URL
+    search_url: Optional[str]
+
+    def __init__(self, base_url="https://tpb.party"):
+        """ThePirateBay Object
 
         Args:
             base_url (str): PirateBay URL to use for queries
 
-        Attributes:
-            search_url (str): This is the compiled search string used
-                to query the PirateBay URL, modified when calling search
-                method
-        '''
+        """
         self.base_url = base_url
         self.search_url = None
-
-    def __str__(self):
-        return 'TPB Object, base URL: {}'.format(self.base_url)
-
-    def search(self, query, page=0, order=99, category=0):
-        '''Search ThePirateBay and retturn list of Torrents
+
+    def __str__(self) -> str:
+        return "TPB Object, base URL: {}".format(self.base_url)
+
+    def search(
+        self, query: str, page: int = 0, order: int = 99, category: int = 0
+    ) -> Torrent:
+        """Search ThePirateBay and return list of Torrents
 
         Args:
-            query (str): Search string to query ThePirateBay
-            page (int): page number to grab results from
+            query: Search string to query ThePirateBay
+            page: page number to grab results from
             order TODO
             category TODO
-        '''
-        q = QueryParser(query, self.base_url, page, order, category)
+
+        Return:
+            Torrent
+
+        """
+        q = QueryParser.from_search(query, self.base_url, page, order, category)
+        self.search_url = q.url
+        return Torrents(q.html_source)
+
+    def browse(
+        self, category: int = 0, page: int = 0, order: int = 99
+    ) -> Torrent:
+        """Browse ThePirateBay and return list of Torrents
+
+        Args:
+            query: Search string to query ThePirateBay
+            page: page number to grab results from
+            order TODO
+            category TODO
+
+        Return:
+            Torrent
+
+        """
+        q = QueryParser.from_browse(self.base_url, category, page, order)
         self.search_url = q.url
-        return Torrents(query, q.html_source)
\ No newline at end of file
+        return Torrents(q.html_source)
-- 
cgit v1.2.3


From ec4f40de81d75d54764ab16915aefd082585ea4a Mon Sep 17 00:00:00 2001
From: JPFrancoia
Date: Wed, 25 Dec 2019 10:47:29 +0100
Subject: Addressing some comments in the PR (mainly code cleaning).

---
 tpblite/models/utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tpblite/models/utils.py b/tpblite/models/utils.py
index eb24f8c..1d5bda6 100644
--- a/tpblite/models/utils.py
+++ b/tpblite/models/utils.py
@@ -1,4 +1,4 @@
-from typing import Tuple, Type
+from typing import Tuple
 import random
 from urllib.request import Request, urlopen
 import urllib.error
@@ -35,9 +35,7 @@ class QueryParser:
 
     @classmethod
     def from_browse(cls, base_url: str, category: int, page: int, order: int):
-        print("browsing")
         segments = ("browse", str(category), str(page), str(order), "0")
-
         return cls(base_url, segments)
 
     def _sendRequest(self):
-- 
cgit v1.2.3


From 3435c5b357e7bd0bfc1ea1720230f534b14a5c15 Mon Sep 17 00:00:00 2001
From: JPFrancoia
Date: Wed, 25 Dec 2019 11:06:01 +0100
Subject: Renaming QueryParser from_browse and from_search methods to browse
 and search. Adding type hinting for these two methods.

---
 tpblite/models/utils.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tpblite/models/utils.py b/tpblite/models/utils.py
index 1d5bda6..c6b6248 100644
--- a/tpblite/models/utils.py
+++ b/tpblite/models/utils.py
@@ -1,9 +1,12 @@
-from typing import Tuple
+from typing import Tuple, TypeVar
 import random
 from urllib.request import Request, urlopen
 import urllib.error
 from purl import URL as pURL
 
+# https://github.com/python/typing/issues/58#issuecomment-326240794
+T = TypeVar("T", bound="QueryParser")
+
 
 class QueryParser:
     """Query object capable of getting html response given a search query and other
@@ -27,14 +30,14 @@ class QueryParser:
             )
 
     @classmethod
-    def from_search(
+    def search(
         cls, query: str, base_url: str, page: int, order: int, category: int
-    ):
+    ) -> T:
         segments = ("search", query, str(page), str(order), str(category))
         return cls(base_url, segments)
 
     @classmethod
-    def from_browse(cls, base_url: str, category: int, page: int, order: int):
+    def browse(cls, base_url: str, category: int, page: int, order: int) -> T:
         segments = ("browse", str(category), str(page), str(order), "0")
         return cls(base_url, segments)
 
-- 
cgit v1.2.3


From 836fa075264a1d1f43a434c2e134d6bf6cb57943 Mon Sep 17 00:00:00 2001
From: JPFrancoia
Date: Wed, 25 Dec 2019 11:14:21 +0100
Subject: Adding comment explaining the 0 at the end of the browse URL.

---
 tpblite/models/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tpblite/models/utils.py b/tpblite/models/utils.py
index c6b6248..dd03b55 100644
--- a/tpblite/models/utils.py
+++ b/tpblite/models/utils.py
@@ -38,6 +38,7 @@ class QueryParser:
 
     @classmethod
     def browse(cls, base_url: str, category: int, page: int, order: int) -> T:
+        # The 0 is added to the URL to stay consistent with the manual web request
         segments = ("browse", str(category), str(page), str(order), "0")
         return cls(base_url, segments)
 
-- 
cgit v1.2.3


From fdb0167687f1772ff4b2363010e417234061af04 Mon Sep 17 00:00:00 2001
From: JPFrancoia
Date: Wed, 25 Dec 2019 11:54:00 +0100
Subject: search_url is now a private atttribute and was renamed _search_url.

---
 tpblite/tpblite.py | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/tpblite/tpblite.py b/tpblite/tpblite.py
index 9d153ac..e4a0c58 100644
--- a/tpblite/tpblite.py
+++ b/tpblite/tpblite.py
@@ -9,10 +9,7 @@ class TPB:
     # PirateBay URL to use for queries
     base_url: str
 
-    # Compiled search string used to query the PirateBay URL
-    search_url: Optional[str]
-
-    def __init__(self, base_url="https://tpb.party"):
+    def __init__(self, base_url: str = "https://tpb.party"):
         """ThePirateBay Object
 
         Args:
@@ -20,7 +17,9 @@ class TPB:
 
         """
         self.base_url = base_url
-        self.search_url = None
+
+        # Compiled search string used to query the PirateBay URL
+        self._search_url: Optional[str] = None
 
     def __str__(self) -> str:
         return "TPB Object, base URL: {}".format(self.base_url)
@@ -41,12 +40,10 @@ class TPB:
 
         """
         q = QueryParser.from_search(query, self.base_url, page, order, category)
-        self.search_url = q.url
+        self._search_url = q.url
         return Torrents(q.html_source)
 
-    def browse(
-        self, category: int = 0, page: int = 0, order: int = 99
-    ) -> Torrent:
+    def browse(self, category: int = 0, page: int = 0, order: int = 99) -> Torrent:
         """Browse ThePirateBay and return list of Torrents
 
         Args:
@@ -60,5 +57,5 @@ class TPB:
 
         """
         q = QueryParser.from_browse(self.base_url, category, page, order)
-        self.search_url = q.url
+        self._search_url = q.url
         return Torrents(q.html_source)
-- 
cgit v1.2.3


From 4418d8c73b26639016733bc0a3264a96046c6ab1 Mon Sep 17 00:00:00 2001
From: JPFrancoia
Date: Wed, 25 Dec 2019 17:11:09 +0100
Subject: Fixing mistakes related to type hinting.

---
 tpblite/models/utils.py |  6 +++---
 tpblite/tpblite.py      | 14 ++++++--------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/tpblite/models/utils.py b/tpblite/models/utils.py
index dd03b55..6c479d4 100644
--- a/tpblite/models/utils.py
+++ b/tpblite/models/utils.py
@@ -1,4 +1,4 @@
-from typing import Tuple, TypeVar
+from typing import Tuple, Type, TypeVar
 import random
 from urllib.request import Request, urlopen
 import urllib.error
@@ -31,13 +31,13 @@ class QueryParser:
 
     @classmethod
     def search(
-        cls, query: str, base_url: str, page: int, order: int, category: int
+        cls: Type[T], query: str, base_url: str, page: int, order: int, category: int
     ) -> T:
         segments = ("search", query, str(page), str(order), str(category))
         return cls(base_url, segments)
 
     @classmethod
-    def browse(cls, base_url: str, category: int, page: int, order: int) -> T:
+    def browse(cls: Type[T], base_url: str, category: int, page: int, order: int) -> T:
         # The 0 is added to the URL to stay consistent with the manual web request
         segments = ("browse", str(category), str(page), str(order), "0")
         return cls(base_url, segments)
diff --git a/tpblite/tpblite.py b/tpblite/tpblite.py
index e4a0c58..3c68d62 100644
--- a/tpblite/tpblite.py
+++ b/tpblite/tpblite.py
@@ -6,9 +6,6 @@ from .models.utils import QueryParser
 
 class TPB:
 
-    # PirateBay URL to use for queries
-    base_url: str
-
     def __init__(self, base_url: str = "https://tpb.party"):
         """ThePirateBay Object
 
         Args:
             base_url (str): PirateBay URL to use for queries
 
         """
+        # PirateBay URL to use for queries
         self.base_url = base_url
 
         # Compiled search string used to query the PirateBay URL
@@ -26,7 +24,7 @@ class TPB:
 
     def search(
         self, query: str, page: int = 0, order: int = 99, category: int = 0
-    ) -> Torrent:
+    ) -> Torrents:
         """Search ThePirateBay and return list of Torrents
 
         Args:
@@ -36,14 +34,14 @@ class TPB:
             category TODO
 
         Return:
-            Torrent
+            Torrents
 
         """
-        q = QueryParser.from_search(query, self.base_url, page, order, category)
+        q = QueryParser.search(query, self.base_url, page, order, category)
         self._search_url = q.url
         return Torrents(q.html_source)
 
-    def browse(self, category: int = 0, page: int = 0, order: int = 99) -> Torrent:
+    def browse(self, category: int = 0, page: int = 0, order: int = 99) -> Torrents:
         """Browse ThePirateBay and return list of Torrents
 
         Args:
@@ -56,6 +54,6 @@ class TPB:
             Torrent
 
         """
-        q = QueryParser.from_browse(self.base_url, category, page, order)
+        q = QueryParser.browse(self.base_url, category, page, order)
         self._search_url = q.url
         return Torrents(q.html_source)
-- 
cgit v1.2.3
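
Usage note (not part of the patch series above): the sketch below shows how the browse() method added in this series sits next to the existing search() API after the final commit. It assumes the tpblite package re-exports TPB at the top level (the package __init__ is not touched by these commits) and that the default mirror is reachable; the query string is a placeholder, and only the default category/order codes shown in the diff (0 and 99) are taken from this series.

from tpblite import TPB  # assumes TPB is re-exported by the package; __init__ is not shown in this series

tpb = TPB()  # defaults to base_url="https://tpb.party"

# New in this series: browse a category without a query string.
# category=0 and order=99 are the defaults from the diff; other numeric codes are site-defined.
listing = tpb.browse(category=0, page=0, order=99)
for torrent in listing:
    print(torrent)  # Torrent.__str__ formats "{title}, S: {seeds}, L: {leeches}, {filesize}"

# Existing search path, now routed through QueryParser.search() internally.
results = tpb.search("ubuntu 19.10")  # placeholder query
best = results.getBestTorrent(min_seeds=30, min_filesize="1 GiB", max_filesize="4 GiB")
if best is not None:
    print(best.title, best.seeds, best.magnetlink)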