|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290 |
-
- import logging
- import re
- import itertools
- import time
- import json
-
- from jps2sm.myloginsession import jpopsuki, sugoimusic
- from jps2sm.constants import Categories
- from jps2sm.utils import remove_html_tags
-
-
- from bs4 import BeautifulSoup
-
- logger = logging.getLogger('main.' + __name__)
-
-
class GetGroupData:
    """
    Retrieve group data of the group supplied from args.parsed.urls

    Group data is defined as data that is constant for every release, eg category,
    artist, title, groupdescription, tags etc.
    Each property is gathered by getdata() and exposed as an instance attribute.
    """

    def __init__(self, jpsurl):
        """
        :param jpsurl: JPS group (or release) URL; only the first whitespace-separated
            URL is fetched, and the groupid is parsed from the full string
        """
        self.jpsurl = jpsurl
        logger.debug(f'Processing JPS URL: {jpsurl}')
        self.groupid: int = int()
        self.category: str = str()
        self.artist: str = str()
        self.date: str = str()
        self.title: str = str()
        self.originalartist: str = str()
        self.originaltitle: str = str()
        self.rel2: str = str()
        self.groupdescription: str = str()
        self.imagelink: str = str()
        self.tagsall: str = str()
        self.contribartists: str = str()

        self.getdata()

    def getdata(self):
        """
        Fetch the JPS torrent description page and populate every group-level
        attribute by scraping it.
        """
        # Strict YYYY.MM.DD release date
        date_regex = r'[12]\d{3}\.(?:0[1-9]|1[0-2])\.(?:0[1-9]|[12]\d|3[01])'
        # Either a YYYY.MM.DD date or a bare 19xx/20xx year
        date_regex2 = r'(?:[12]\d{3}\.(?:0[1-9]|1[0-2])\.(?:0[1-9]|[12]\d|3[01])|(?:19|20)\d\d)'

        res = jpopsuki(self.jpsurl.split()[0])

        # First run of digits not preceded by "id=" -- the group id from the URL
        self.groupid = re.findall(r"(?!id=)\d+", self.jpsurl)[0]

        soup = BeautifulSoup(res.text, 'html5lib')
        artistlinelink = soup.select('.thin h2 a')
        originaltitleline = soup.select('.thin h3')

        logger.debug(torrent_description_page_h2_line := str(soup.select('.thin h2')[0]))

        # Category is the first bracketed token in the h2 line, eg "[Album]"
        self.category = re.findall(r'\[(.*?)\]', torrent_description_page_h2_line)[0]
        logger.info(f'Category: {self.category}')

        try:
            artist_raw = re.findall(r'<a[^>]+>(.*)<', str(artistlinelink[0]))[0]
            self.artist = split_bad_multiple_artists(artist_raw)
        except IndexError:
            # No artist link on the page -- try to autodetect from the h2 text
            if self.category == "Pictures":
                # NOTE(review): re.findall() itself never raises IndexError, so this
                # except clause is dead and self.artist is assigned the (possibly
                # empty) findall list here -- preserved as-is to keep behaviour.
                try:
                    self.artist = re.findall(fr'\[Pictures\] ([A-Za-z\. ]+) (?:{date_regex2})', torrent_description_page_h2_line)
                except IndexError:
                    logger.exception('Cannot find artist')
                    raise
            elif self.category == "Misc":
                try:
                    artist_raw = re.findall(r'\[Misc\] ([A-Za-z\, ]+) - ', torrent_description_page_h2_line)[0]
                except IndexError:
                    logger.exception('Cannot find artist')
                    raise
                self.artist = split_bad_multiple_artists(artist_raw)
            else:
                logger.exception('JPS upload appears to have no artist set and artist cannot be autodetected')
                raise

        logger.info(f'Artist(s): {self.artist}')

        # Release date: prefer a full YYYY.MM.DD date, fall back to a bare year
        try:
            self.date = re.findall(date_regex, torrent_description_page_h2_line)[0].replace(".", "")
        except IndexError:
            try:
                self.date = re.findall(r'[^\d]((?:19|20)\d{2})[^\d]', torrent_description_page_h2_line)[0]
            except IndexError:
                if self.category not in Categories.NonDate:
                    logger.exception(f'Group release date not found and not using upload date instead as {self.category} torrents should have it set')
                else:
                    logger.warning('Date not found from group data, will use upload date as the release date')
                    self.date = None

        logger.info(f'Release date: {self.date}')

        if self.category not in Categories.NonDate:
            self.title = re.findall(r'<a.*> - (.*) \[', torrent_description_page_h2_line)[0]
        else:
            # Non-dated categories carry the title either after or before the date
            title1 = re.findall(r'<a.*> - (?:[12]\d{3}\.(?:0[1-9]|1[0-2])\.(?:0[1-9]|[12]\d|3[01])) - (.*)</h2>', torrent_description_page_h2_line)
            title2 = re.findall(r'<a.*> - (.*) \((.*) (?:[12]\d{3}\.(?:0[1-9]|1[0-2])\.(?:0[1-9]|[12]\d|3[01]))', torrent_description_page_h2_line)

            titlemergedpre = [title1, " ".join(itertools.chain(*title2))]
            titlemerged = "".join(itertools.chain(*titlemergedpre))
            if len(titlemerged) == 0:  # neither layout matched -- try the fallbacks
                try:
                    self.title = re.findall(r'<a.*> - (.*)</h2>', torrent_description_page_h2_line)[0]
                except IndexError:
                    if self.category == "Pictures":
                        try:
                            self.title = re.findall(fr'\[Pictures\] (?:[A-Za-z\. ]+) ({date_regex2}(?:.*))</h2>', torrent_description_page_h2_line)[0]
                        except IndexError:
                            logger.exception('Cannot find title from the JPS upload')
                            raise
                    elif self.category == "Misc":
                        try:
                            self.title = re.findall(r'\[Misc\] (?:[A-Za-z\, ]+) - (.+)</h2>', torrent_description_page_h2_line)[0]
                        except IndexError:
                            logger.exception('Cannot find title from the JPS upload')
                            raise
                    else:
                        logger.exception('Cannot find title from the JPS upload')
                        raise
            else:
                self.title = titlemerged

        logger.info(f'Title: {self.title}')

        # Original (non-romanised) artist/title from the h3 line, if present
        try:
            originalchars = re.findall(r'<a href="artist.php\?id=(?:[0-9]+)">(.+)</a> - (.+)\)</h3>', str(originaltitleline))[0]
            self.originalartist = originalchars[0]
            self.originaltitle = originalchars[1]
            logger.info(f"Original artist: {self.originalartist} Original title: {self.originaltitle}")
        except IndexError:
            pass  # group has no original-characters line

        self.rel2 = str(soup.select('#content .thin .main_column .torrent_table tbody')[0])

        # Prefer the raw BBCode (needs Power User+); fall back to stripped HTML.
        # Bug fix: was a bare `except:`; narrowed to Exception so KeyboardInterrupt
        # and SystemExit propagate instead of being swallowed here.
        try:
            self.groupdescription = get_group_descrption_bbcode(self.groupid)
        except Exception:
            logger.exception('Could not get group description BBCode. Are you a Power User+ at JPS?')
            self.groupdescription = remove_html_tags(str(soup.select('#content .thin .main_column .box .body')[0]))

        logger.info(f"Group description:\n{self.groupdescription}")

        image = str(soup.select('#content .thin .sidebar .box p a'))
        try:
            self.imagelink = "https://jpopsuki.eu/" + re.findall('<a\s+(?:[^>]*?\s+)?href=\"([^\"]*)\"', image)[0]
            logger.info(f'Image link: {self.imagelink}')
        except IndexError:
            self.imagelink = None  # no cover image on the group page

        tagsget = str(soup.select('#content .thin .sidebar .box ul.stats.nobullet li'))
        tags = re.findall('searchtags=([^\"]+)', tagsget)
        logger.info(f'Tags: {tags}')
        self.tagsall = ",".join(tags)

        try:
            contribartistsget = str(soup.select('#content .thin .sidebar .box .body ul.stats.nobullet li'))
            contribartistslist = re.findall(r'<li><a href="artist\.php\?id=(?:[0-9]+?)" title="([^"]*?)">([\w .-]+)</a>', contribartistsget)
            # Map romanised artist name -> original (title-attribute) name
            self.contribartists = {}
            for artistpair in contribartistslist:
                self.contribartists[artistpair[1]] = artistpair[0]

            logger.info(f'Contributing artists: {self.contribartists}')
        except IndexError:
            pass  # no contributing artists listed

    def originalchars(self):
        """Return the (original artist, original title) pair."""
        return self.originalartist, self.originaltitle

    def __getattr__(self, item):
        # Bug fix: the original `return self.item` looked up the missing
        # attribute 'item', which re-entered __getattr__ and recursed until
        # RecursionError for ANY absent attribute. Raise AttributeError as
        # the attribute protocol requires.
        raise AttributeError(item)
-
-
def split_bad_multiple_artists(artists):
    """
    Split a JPS artist string that may contain several artists joined by
    ", ", " x " or " & " into a list of individual artist names.

    :param artists: raw artist string scraped from JPS
    :return: list of artist name strings (a single-element list if no separator)
    """
    multi_artist_separators = ', | x | & '
    return re.split(multi_artist_separators, artists)
-
-
def get_release_data(torrentids, release_data, date):
    """
    Retrieve all torrent id and release data (slash separated data and upload date)
    whilst coping with 'noise' from FL torrents, and either return all data if using
    a group URL or only return the relevant releases if release url(s) were used.

    :param torrentids: list of torrentids to be processed, NULL if group is used
    :param release_data: HTML of the group's torrent table
    :param date: group release date string; its first 4 chars (year) replace
        stripped freeleech markers in remaster data
    :return: releasedata: 2d dict of release data in the format of
        torrentid: { "slashdata" : [ slashdatalist ] , "uploaddate": uploaddate }
    """
    freeleechtext = '<strong>Freeleech!</strong>'
    raw_releases = re.findall(r"swapTorrent\('([0-9]+)'\);\">» (.*?)</a>.*?<blockquote>(?:\s*)Uploaded by <a href=\"user.php\?id=(?:[0-9]+)\">(?:[\S]+)</a> on <span title=\"(?:[^\"]+)\">([^<]+)</span>", release_data, re.DOTALL)

    # torrentid -> {slashdata: [...], uploaddate: str}
    releasedata = {
        torrentid: {'slashdata': slashfield.split(' / '), 'uploaddate': uploadeddate}
        for torrentid, slashfield, uploadeddate in raw_releases
    }

    logger.debug(f'Entire group contains: {json.dumps(releasedata, indent=2)}')

    removetorrents = []
    for torrentid, release in releasedata.items():
        # When specific release URLs were given, mark everything else for removal
        if len(torrentids) != 0 and torrentid not in torrentids:
            removetorrents.append(torrentid)
        # Strip the standalone freeleech marker from the slash data
        if freeleechtext in release['slashdata']:
            release['slashdata'].remove(freeleechtext)
        # Freeleech remasters embed the marker inside the remaster field;
        # replace it with the group release year
        for index, slashreleaseitem in enumerate(release['slashdata']):
            if remaster_freeleech_removed := re.findall(r'(.*) - <strong>Freeleech!<\/strong>', slashreleaseitem):
                release['slashdata'][index] = f'{remaster_freeleech_removed[0]} - {date[:4]}'
                logger.debug(f"Torrent {torrentid} is freeleech remastered, validated remasterdata to {release['slashdata'][index]}")

    for torrentid in removetorrents:
        del releasedata[torrentid]

    logger.info(f'Selected for upload: {releasedata}')
    return releasedata
-
-
def get_group_descrption_bbcode(groupid):
    """
    Retrieve original bbcode from the JPS edit-group page and reformat any
    JPS-style bbcode for SM.

    :param: groupid: JPS groupid to get group description with bbcode
    :return: bbcode: group description with bbcode
    """
    edit_group_page = jpopsuki(f"https://jpopsuki.eu/torrents.php?action=editgroup&groupid={groupid}")
    soup = BeautifulSoup(edit_group_page.text, 'html5lib')
    bbcode = soup.find("textarea", {"name": "body"}).string

    # JPS uses [youtube=URL]; rewrite to the [youtube]URL[/youtube] form
    return re.sub(r'\[youtube=([^\]]+)]', r'[youtube]\1[/youtube]', bbcode)
-
-
def get_jps_user_id():
    """
    Returns the JPopSuki user id

    :return: int: user id
    """
    front_page = jpopsuki("https://jpopsuki.eu/", True)
    soup = BeautifulSoup(front_page.text, 'html5lib')
    profile_href = soup.select('.username')[0]['href']
    user_id = re.match(r"user\.php\?id=(\d+)", profile_href).group(1)
    time.sleep(5)  # throttle before any follow-up JPS request

    return int(user_id)
-
-
def get_user_keys():
    """
    Get SM session authkey and torrent_password_key for use by uploadtorrent()|download_sm_torrent() data dict.
    Uses SM login data
    """
    smpage = sugoimusic("https://sugoimusic.me/torrents.php?id=118", test_login=True)
    soup = BeautifulSoup(smpage.text, 'html5lib')
    tooltip_html = str(soup.select_one('#torrent_details .group_torrent > td > span > .tooltip'))

    # Both keys live in the same download-link tooltip anchor
    authkey = re.findall('authkey=(.*)&torrent_pass=', tooltip_html)[0]
    torrent_password_key = re.findall(r"torrent_pass=(.+)\" title", tooltip_html)[0]

    return {'authkey': authkey, 'torrent_password_key': torrent_password_key}
-
-
def get_torrent_link(torrentid, release_data):
    """
    Extract a torrent link for a given torrentid

    :param torrentid: torrent id to locate in the release HTML
    :param release_data: HTML fragment containing download anchors
    :return: torrentlink: URI of torrent link
    """
    download_link_pattern = rf'torrents\.php\?action=download&id={torrentid}&authkey=(?:[^&]+)&torrent_pass=(?:[^"]+)'
    return re.findall(download_link_pattern, release_data)[0]
|