author     Loic Coyle <loic.coyle@hotmail.fr>    2020-08-22 20:47:18 +0200
committer  Loic Coyle <loic.coyle@hotmail.fr>    2020-08-22 22:11:26 +0200
commit     526f38208dd35e57cd419de6f4248f4cfde1faca (patch)
tree       4c37994a0d511aafaa40ee93547d04c88094ca43
parent     e9e1c637b95f609a3053c243f2f5837d5214f3c0 (diff)
replace beautifulsoup4 with lxml and remove purl dependency
-rw-r--r--  README.md                    3
-rw-r--r--  setup.py                     6
-rw-r--r--  tpblite/models/torrents.py  27
-rw-r--r--  tpblite/models/utils.py     10

4 files changed, 20 insertions, 26 deletions
diff --git a/README.md b/README.md
index 22e725f..3c268d2 100644
--- a/README.md
+++ b/README.md
@@ -7,8 +7,7 @@ $ pip install tpblite
```
Dependencies:
- - BeautifulSoup
- - purl
+ - lxml
Usage
==========
diff --git a/setup.py b/setup.py
index 4a48939..21aeebf 100644
--- a/setup.py
+++ b/setup.py
@@ -21,9 +21,7 @@ setup(name = 'tpblite',
long_description_content_type='text/markdown',
license = 'MIT License',
packages = ['tpblite', 'tpblite/models'],
- install_requires = [
- 'beautifulsoup4',
- 'purl'],
+ install_requires = ['lxml'],
classifiers = [
'Development Status :: 3 - Alpha',
'Programming Language :: Python',
@@ -33,4 +31,4 @@ setup(name = 'tpblite',
'Topic :: Internet :: WWW/HTTP :: Indexing/Search',
'Topic :: Utilities'],
keywords = ['ThePirateBay', 'PirateBay', 'torrent']
- ) \ No newline at end of file
+ )
diff --git a/tpblite/models/torrents.py b/tpblite/models/torrents.py
index 5145289..4eb5a16 100644
--- a/tpblite/models/torrents.py
+++ b/tpblite/models/torrents.py
@@ -1,6 +1,5 @@
-import re
import unicodedata
-from bs4 import BeautifulSoup
+from lxml.etree import HTML
# TODO: write better comments
@@ -44,19 +43,18 @@ class Torrent:
return "<Torrent object: {}>".format(self.title)
def _getTitle(self):
- return self.html_row.find("a", class_="detLink").string
+ return self.html_row.findtext('.//a[@class="detLink"]')
def _getMagnetLink(self):
- tag = self.html_row.find("a", href=(re.compile("magnet")))
- link = tag.get("href")
- return link
+ return self.html_row.xpath('.//a[starts-with(@href, "magnet")]/@href')[0]
def _getPeers(self):
- taglist = self.html_row.find_all("td", align="right")
- return int(taglist[0].string), int(taglist[1].string)
+ taglist = self.html_row.xpath('.//td[@align="right"]/text()')
+ return int(taglist[0]), int(taglist[1])
def _getFileInfo(self):
- text = self.html_row.find("font", class_="detDesc").get_text()
+ text = self.html_row.xpath('.//font[@class="detDesc"]/descendant::text()')
+ text = ''.join(text)
t = text.split(",")
uptime = unicodedata.normalize("NFKD", t[0].replace("Uploaded ", "").strip())
size = unicodedata.normalize("NFKD", t[1].replace("Size ", "").strip())
@@ -65,7 +63,7 @@ class Torrent:
return uptime, size, byte_size, uploader
def _getUrl(self):
- tag = self.html_row.find("a", class_="detLink")
+ tag = self.html_row.find('.//a[@class="detLink"]')
return tag.get("href")
@@ -96,14 +94,13 @@ class Torrents:
return self.list[index]
def _createTorrentList(self):
- soup = BeautifulSoup(self.html_source, features="html.parser")
- if soup.body is None:
+ root = HTML(self.html_source)
+ if root.find("body") is None:
raise ConnectionError("Could not determine torrents (empty html body)")
- rows = soup.body.find_all("tr")
+ rows = root.xpath('//tr[td[@class="vertTh"]]')
torrents = []
for row in rows:
- if len(row.find_all("td", {"class": "vertTh"})) == 1:
- torrents.append(Torrent(row))
+ torrents.append(Torrent(row))
return torrents
def getBestTorrent(self, min_seeds=30, min_filesize="1 GiB", max_filesize="4 GiB"):
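For reference, below is a minimal, self-contained sketch of the lxml calls this torrents.py hunk switches to. The row markup is a simplified, made-up stand-in for a search-result row, not the real page source:

```
# Illustrative only: a stand-in row, not the actual page markup.
from lxml.etree import HTML

source = """
<html><body><table>
  <tr>
    <td class="vertTh">Video</td>
    <td>
      <a class="detLink" href="/torrent/1/Example">Example</a>
      <a href="magnet:?xt=urn:btih:abc123">magnet</a>
      <font class="detDesc">Uploaded 01-01 2020, Size 1.2 GiB, ULed by someone</font>
    </td>
    <td align="right">42</td>
    <td align="right">7</td>
  </tr>
</table></body></html>
"""

root = HTML(source)                                   # parse the page into an element tree
rows = root.xpath('//tr[td[@class="vertTh"]]')        # keep only rows with a category cell
row = rows[0]

title = row.findtext('.//a[@class="detLink"]')                      # "Example"
magnet = row.xpath('.//a[starts-with(@href, "magnet")]/@href')[0]   # the magnet URI
seeds, leeches = (int(t) for t in row.xpath('.//td[@align="right"]/text()'))
info = ''.join(row.xpath('.//font[@class="detDesc"]/descendant::text()'))

print(title, magnet, seeds, leeches, info)
```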
diff --git a/tpblite/models/utils.py b/tpblite/models/utils.py
index b9126e9..c5b5cfd 100644
--- a/tpblite/models/utils.py
+++ b/tpblite/models/utils.py
@@ -1,8 +1,8 @@
from typing import Tuple, Type, TypeVar
import random
from urllib.request import Request, urlopen
+from urllib.parse import urlparse, urlunparse, quote
import urllib.error
-from purl import URL as pURL
# https://github.com/python/typing/issues/58#issuecomment-326240794
T = TypeVar("T", bound="QueryParser")
@@ -26,7 +26,7 @@ class QueryParser:
self.html_source = self._sendRequest()
except urllib.error.URLError:
raise ConnectionError(
- "Could not establish connection wtih {}".format(self.base_url)
+ "Could not establish connection with {}".format(self.url)
)
@classmethod
@@ -58,9 +58,9 @@ class QueryParser:
def URL(base: str, segments: Tuple[str, ...]) -> str:
- u = pURL().from_string(base)
- url = u.path_segments(segments)
- return url.as_string()
+ url = list(urlparse(base))
+ url[2] = '/'.join((quote(s) for s in segments))
+ return urlunparse(url)
def headers():
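For reference, the reworked URL() helper in this last hunk is equivalent to the following standalone sketch built on urllib.parse; the example base URL and path segments are illustrative only:

```
from typing import Tuple
from urllib.parse import urlparse, urlunparse, quote

def URL(base: str, segments: Tuple[str, ...]) -> str:
    parts = list(urlparse(base))                       # [scheme, netloc, path, params, query, fragment]
    parts[2] = '/'.join(quote(s) for s in segments)    # replace the path with percent-encoded segments
    return urlunparse(parts)

# Example with made-up values:
print(URL("https://example.org", ("search", "some query", "0", "99", "0")))
# https://example.org/search/some%20query/0/99/0
```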