# Standard library packages
import logging
import re
import itertools
import time
import json
from jps2sm.myloginsession import jpopsuki, sugoimusic
from jps2sm.constants import Categories
from jps2sm.utils import remove_html_tags
# Third-party packages
from bs4 import BeautifulSoup
logger = logging.getLogger('main.' + __name__)
class GetGroupData:
    """
    Retrieve group data of the group supplied from args.parsed.urls

    Group data is defined as data that is constant for every release, eg category, artist, title,
    groupdescription, tags etc. Every attribute is filled in by getdata(), which the constructor calls,
    so creating an instance already requests the group page through jpopsuki().

    :param jpsurl: JPS url(s) as a single whitespace separated string; only the first url is requested
    """

    # YYYY.MM.DD format
    _DATE_REGEX = r'[12]\d{3}\.(?:0[1-9]|1[0-2])\.(?:0[1-9]|[12]\d|3[01])'
    # YYYY.MM.DD OR YYYY format, for Pictures only
    _DATE_REGEX2 = r'(?:[12]\d{3}\.(?:0[1-9]|1[0-2])\.(?:0[1-9]|[12]\d|3[01])|(?:19|20)\d\d)'

    def __init__(self, jpsurl):
        self.jpsurl = jpsurl
        logger.debug(f'Processing JPS URL: {jpsurl}')
        self.groupid: str = str()  # Digits taken from the url, kept as a string
        self.category: str = str()
        self.artist: list = list()  # List of artist names
        self.date: str = str()  # 'YYYYMMDD' or 'YYYY'; None when the title line carries no date
        self.title: str = str()
        self.originalartist: str = str()
        self.originaltitle: str = str()
        self.rel2: str = str()
        self.groupdescription: str = str()
        self.imagelink: str = str()  # None when the group has no image
        self.tagsall: str = str()
        self.contribartists: dict = dict()  # contribartists[artist] = origartist
        self.getdata()

    def getdata(self):
        """
        Request the group page and fill in every group attribute of this instance.

        :raises IndexError: if the heading, category, artist, title or torrent table cannot be found on the page
        """
        # If there are multiple urls only the first url needs to be parsed
        res = jpopsuki(self.jpsurl.split()[0])

        # The '(?!id=)' lookahead can never reject a digit, so this is simply the first run of digits in the url string
        self.groupid = re.findall(r"(?!id=)\d+", self.jpsurl)[0]

        soup = BeautifulSoup(res.text, 'html5lib')
        artistlinelink = soup.select('.thin h2 a')
        originaltitleline = soup.select('.thin h3')

        torrent_description_page_h2_line = str(soup.select('.thin h2')[0])
        logger.debug(torrent_description_page_h2_line)

        self.category = re.findall(r'\[(.*?)\]', torrent_description_page_h2_line)[0]
        logger.info(f'Category: {self.category}')

        self.artist = self._parse_artist(artistlinelink, torrent_description_page_h2_line)
        logger.info(f'Artist(s): {self.artist}')

        self.date = self._parse_date(torrent_description_page_h2_line)
        logger.info(f'Release date: {self.date}')

        self.title = self._parse_title(torrent_description_page_h2_line)
        logger.info(f'Title: {self.title}')

        # Do nothing if group has no original artist/title
        for heading in originaltitleline:
            # Parse the heading's text, not its markup, so the captured names carry no HTML
            originalchars = self._parse_original_chars(heading.get_text())
            if originalchars is not None:
                self.originalartist, self.originaltitle = originalchars
                logger.info(f"Original artist: {self.originalartist} Original title: {self.originaltitle}")
                break

        self.rel2 = str(soup.select('#content .thin .main_column .torrent_table tbody')[0])

        # Get description with BB Code if user has group edit permissions on JPS, if not just use stripped html text.
        try:
            self.groupdescription = get_group_descrption_bbcode(self.groupid)  # Requires PU+ at JPS
        except Exception:  # Deliberately broad (best effort), but lets KeyboardInterrupt/SystemExit through
            logger.exception('Could not get group description BBCode. Are you a Power User+ at JPS?')
            self.groupdescription = remove_html_tags(str(soup.select('#content .thin .main_column .box .body')[0]))
        logger.info(f"Group description:\n{self.groupdescription}")

        self.imagelink = self._parse_image_link(str(soup.select('#content .thin .sidebar .box p a')))
        if self.imagelink is not None:
            logger.info(f'Image link: {self.imagelink}')

        tagsget = str(soup.select('#content .thin .sidebar .box ul.stats.nobullet li'))
        tags = re.findall('searchtags=([^\"]+)', tagsget)
        logger.info(f'Tags: {tags}')
        self.tagsall = ",".join(tags)

        contribartistsget = str(soup.select('#content .thin .sidebar .box .body ul.stats.nobullet li'))
        self.contribartists = self._parse_contrib_artists(contribartistsget)
        logger.info(f'Contributing artists: {self.contribartists}')

    def _parse_artist(self, artistlinelink, h2_line):
        """
        Return the list of group artists.

        :param artistlinelink: the <a> elements found inside the group heading
        :param h2_line: the group heading as an html string
        :return: list of artist names
        :raises IndexError: if there is no artist link and no artist can be inferred from the heading
        """
        try:
            # The text of the first link in the heading is the artist string
            artist_raw = re.findall(r'<a[^>]+>(.*)<', str(artistlinelink[0]))[0]
            return split_bad_multiple_artists(artist_raw)
        except IndexError:  # Cannot find artist
            if self.category == "Pictures":
                # JPS allows Picture torrents to have no artist set, in this scenario try to infer the artist by examining the text
                # immediately after the category string up to a YYYY.MM.DD string if available as this should be the magazine title
                found = re.findall(fr'\[Pictures\] ([A-Za-z\. ]+) (?:{self._DATE_REGEX2})', h2_line)
                if not found:  # findall() returns an empty list rather than raising, so check explicitly
                    logger.exception('Cannot find artist')
                    raise
                return found
            if self.category == "Misc":
                # JPS has some older groups with no artists set, uploaders still used the "Artist - Group name" syntax though
                try:
                    artist_raw = re.findall(r'\[Misc\] ([A-Za-z\, ]+) - ', h2_line)[0]
                except IndexError:
                    logger.exception('Cannot find artist')
                    raise
                return split_bad_multiple_artists(artist_raw)
            logger.exception('JPS upload appears to have no artist set and artist cannot be autodetected')
            raise

    def _parse_date(self, h2_line):
        """
        Return the release date found in the group heading.

        :param h2_line: the group heading as an html string
        :return: 'YYYYMMDD', or 'YYYY' when only a year is present, or None when no date is present
        """
        # Extract date without using '[]' as it allows '[]' elsewhere in the title and it works with JPS TV-* categories
        try:
            return re.findall(self._DATE_REGEX, h2_line)[0].replace(".", "")
        except IndexError:  # Handle YYYY dates, creating extra regex as I cannot get it working without causing issue #33
            try:
                return re.findall(r'[^\d]((?:19|20)\d{2})[^\d]', h2_line)[0]
            # Handle if cannot find date in the title, error if the category should have it
            except IndexError:
                if self.category not in Categories.NonDate:
                    logger.exception(f'Group release date not found and not using upload date instead as {self.category} torrents should have it set')
                else:
                    logger.warning('Date not found from group data, will use upload date as the release date')
                # None (rather than the empty default) marks "no date found" in both cases
                return None

    def _parse_title(self, h2_line):
        """
        Return the group title found in the group heading.

        :param h2_line: the group heading as an html string
        :return: the title with any trailing '</h2>' removed
        :raises IndexError: if no title can be found
        """
        if self.category not in Categories.NonDate:
            title = re.findall(r' - (.*) \[', h2_line)[0]
        else:
            # Using two sets of findall() as I cannot get the OR regex operator "|" to work
            title1 = re.findall(fr' - (?:{self._DATE_REGEX}) - (.*)', h2_line)
            title2 = re.findall(fr' - (.*) \((.*) (?:{self._DATE_REGEX})', h2_line)
            # title1 has 1 matching group, title2 has 2
            titlemerged = "".join(title1) + " ".join(itertools.chain.from_iterable(title2))
            if titlemerged:
                title = titlemerged
            else:  # Non standard title
                title = self._parse_nonstandard_title(h2_line)
        # Patterns that run to the end of the heading also capture its closing tag, drop it
        return re.sub(r'</h2>\s*$', '', title)

    def _parse_nonstandard_title(self, h2_line):
        """
        Return the title of a heading that does not follow the usual dated layout.

        :param h2_line: the group heading as an html string
        :raises IndexError: if no title can be found
        """
        try:
            # Fallback on the whole string after the "-"
            return re.findall(r' - (.*)', h2_line)[0]
        except IndexError:
            if self.category == "Pictures":  # Pictures non-artist upload - for magazines
                # Fallback to all the text after the category, we need to include the date stamp as magazines are often titled
                # with the same numbers each year - the first magazine each year appears to always be 'No. 1' for example
                try:
                    return re.findall(fr'\[Pictures\] (?:[A-Za-z\. ]+) ({self._DATE_REGEX2}(?:.*))', h2_line)[0]
                except IndexError:
                    logger.exception('Cannot find title from the JPS upload')
                    raise
            elif self.category == "Misc":
                try:
                    return re.findall(r'\[Misc\] (?:[A-Za-z\, ]+) - (.+)', h2_line)[0]
                except IndexError:
                    logger.exception('Cannot find title from the JPS upload')
                    raise
            else:
                logger.exception('Cannot find title from the JPS upload')
                raise

    @staticmethod
    def _parse_original_chars(heading_text):
        """
        Split the text of an original-title heading into (original artist, original title).

        :param heading_text: text content of the heading, expected to look like '(artist - title)'
        :return: (originalartist, originaltitle) tuple, or None if the text does not have that form
        """
        # Lazy first group: split at the first ' - ' so a title containing ' - ' stays in one piece
        found = re.findall(r'^\(?(.+?) - (.+)\)', heading_text.strip())
        return found[0] if found else None

    @staticmethod
    def _parse_image_link(anchor_markup):
        """
        Return the absolute url of the first link in the given markup.

        :param anchor_markup: html string of the sidebar image link(s)
        :return: 'https://jpopsuki.eu/' + href of the first <a>, or None if there is no link (no image for the group)
        """
        hrefs = re.findall(r'<a\s+(?:[^>]*?\s+)?href="([^"]*)"', anchor_markup)
        return ("https://jpopsuki.eu/" + hrefs[0]) if hrefs else None

    @staticmethod
    def _parse_contrib_artists(artist_list_markup):
        """
        Build the contributing artists mapping from the sidebar artist list.

        :param artist_list_markup: html string of the sidebar list items
        :return: dict of contribartists[artist] = origartist, empty if the group has no contrib artists
        """
        # NOTE(review): assumes each contributing artist is rendered as
        # <a href="artist.php?id=N" title="ORIGINAL NAME">NAME</a> - confirm against a live group page
        artistpairs = re.findall(r'<a href="artist\.php\?id=(?:[0-9]+?)" title="([^"]*?)">([\w .-]+)</a>', artist_list_markup)
        return {artist: origartist for origartist, artist in artistpairs}

    def originalchars(self):
        """Return the (originalartist, originaltitle) pair."""
        return self.originalartist, self.originaltitle

    def __getattr__(self, item):
        # Only called once normal attribute lookup has failed, so the attribute really does not exist.
        # Looking it up on self again would recurse until RecursionError.
        raise AttributeError(f"{type(self).__name__!r} object has no attribute {item!r}")
def split_bad_multiple_artists(artists):
    """
    Break a combined artist string into individual artist names.

    The string is cut at every ', ', ' x ' or ' & ' separator.

    :param artists: string holding one or more artist names
    :return: list of artist names, a one-element list if no separator is present
    """
    separators = ', | x | & '
    artist_names = re.split(separators, artists)
    return artist_names
def get_release_data(torrentids, release_data, date):
    """
    Retrieve all torrent id and release data (slash separated data and upload date) whilst coping with 'noise' from FL torrents,
    and either return all data if using a group URL or only return the relevant releases if release url(s) were used

    :param torrentids: torrentids to be processed; empty or None if a group url is used, in which case every release is returned
    :param release_data: html string of the group's release listing
    :param date: group release date string, its first four characters are used as the year of freeleech remasters; may be None
    :return: releasedata: 2d dict of release data in the format of torrentid: { "slashdata" : [ slashdatalist ] , "uploaddate": uploaddate } .
    """
    # Whole-item Freeleech markers, with and without the surrounding <strong> markup
    freeleech_items = ('Freeleech!', '<strong>Freeleech!</strong>')

    # NOTE(review): the slash data is taken as everything between '» ' and the next '</a>', i.e. this assumes the
    # swapTorrent() handler sits on an <a> element - confirm against the live torrent table markup
    releasedatapre = re.findall(
        r"swapTorrent\('([0-9]+)'\);\">» (.*?)</a>.*?(?:\s*)Uploaded by (?:[\S]+) on ([^<]+)",
        release_data,
        re.DOTALL,
    )

    releasedata = {
        torrentid: {'slashdata': slashdata.split(' / '), 'uploaddate': uploadeddate}
        for torrentid, slashdata, uploadeddate in releasedatapre
    }
    logger.debug(f'Entire group contains: {json.dumps(releasedata, indent=2)}')

    # If torrentids is empty then user has supplied a group url and every release is processed,
    # otherwise keep only the releases that were asked for
    if torrentids:
        releasedata = {torrentid: release for torrentid, release in releasedata.items() if torrentid in torrentids}

    release_year = (date or '')[:4]  # Tolerate a group without a release date
    for torrentid, release in releasedata.items():
        slashdata = release['slashdata']
        for freeleech_item in freeleech_items:
            if freeleech_item in slashdata:
                slashdata.remove(freeleech_item)  # Remove Freeleech whole match so it does not interfere with Remastered

        for index, slashreleaseitem in enumerate(slashdata):
            # Handle Freeleech remastered torrents, issue #43
            if remaster_freeleech_removed := re.findall(r'(.*) - (?:<strong>)?Freeleech!<\/strong>', slashreleaseitem):
                # Use the extracted value and append group JPS release year
                slashdata[index] = f'{remaster_freeleech_removed[0]} - {release_year}'
                logger.debug(f"Torrent {torrentid} is freeleech remastered, validated remasterdata to {slashdata[index]}")

    logger.info(f'Selected for upload: {releasedata}')
    return releasedata
def get_group_descrption_bbcode(groupid):
    """
    Retrieve original bbcode from edit group url and reformat any JPS style bbcode

    :param: groupid: JPS groupid to get group description with bbcode
    :return: bbcode: group description with bbcode, an empty string if the description is empty
    :raises AttributeError: if the edit page has no 'body' textarea
    """
    edit_group_page = jpopsuki(f"https://jpopsuki.eu/torrents.php?action=editgroup&groupid={groupid}")
    soup = BeautifulSoup(edit_group_page.text, 'html5lib')

    textarea = soup.find("textarea", {"name": "body"})
    if textarea is None:
        # Same exception type as dereferencing None would give, but with a message that says what is missing
        raise AttributeError(f"No 'body' textarea found on the edit page of group {groupid}")

    # get_text() yields '' for an empty textarea, whereas .string would be None and break re.sub()
    bbcode = textarea.get_text()

    # JPS writes [youtube=URL], rewrite it as [youtube]URL[/youtube]
    bbcode_sanitised = re.sub(r'\[youtube=([^\]]+)]', r'[youtube]\1[/youtube]', bbcode)
    return bbcode_sanitised
def get_jps_user_id():
    """
    Read the JPopSuki user id from the href of the first '.username' element on the site's front page.

    :return: int: user id
    """
    front_page = jpopsuki("https://jpopsuki.eu/", True)
    parsed_page = BeautifulSoup(front_page.text, 'html5lib')

    profile_link = parsed_page.select('.username')[0]
    id_match = re.match(r"user\.php\?id=(\d+)", profile_link['href'])
    user_id_text = id_match.group(1)

    time.sleep(5)  # Sleep as otherwise we hit JPS browse quota
    return int(user_id_text)
def get_user_keys():
    """
    Get SM session authkey and torrent_password_key for use by uploadtorrent()|download_sm_torrent() data dict.
    Uses SM login data

    :return: dict with the keys 'authkey' and 'torrent_password_key'
    :raises IndexError: if the page does not contain a link carrying both keys
    """
    smpage = sugoimusic("https://sugoimusic.me/torrents.php?id=118", test_login=True)  # Arbitrary page on SM that has authkey
    soup = BeautifulSoup(smpage.text, 'html5lib')
    rel2 = str(soup.select_one('#torrent_details .group_torrent > td > span > .tooltip'))

    # str() of a tag writes '&' inside attribute values as '&amp;', so accept both spellings of the separator.
    # The values stop at the next '&' or '"' so nothing after them is swallowed.
    return {
        'authkey': re.findall(r'authkey=([^&"]*)&(?:amp;)?torrent_pass=', rel2)[0],
        'torrent_password_key': re.findall(r'torrent_pass=([^&"]+)', rel2)[0],
    }
def get_torrent_link(torrentid, release_data):
    """
    Extract a torrent link for a given torrentid

    :param torrentid: id of the torrent whose download link is wanted
    :param release_data: html string to search for the download link
    :return: torrentlink: URI of torrent link exactly as written in release_data, so '&amp;' separators are
             returned unchanged
    :raises IndexError: if release_data holds no download link for torrentid
    """
    # Serialised html writes the query separator as '&amp;', raw urls as '&' - accept either
    separator = r'&(?:amp;)?'
    link_pattern = (
        r'torrents\.php\?action=download' + separator
        + 'id=' + re.escape(str(torrentid)) + separator  # escape so the id is always matched literally
        + r'authkey=(?:[^&]+)' + separator
        + r'torrent_pass=(?:[^"]+)'
    )
    torrentlink = re.findall(link_pattern, release_data)[0]
    return torrentlink