Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

get_data.py 14 KiB

3 anos atrás
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. # Standard library packages
  2. import logging
  3. import re
  4. import itertools
  5. import time
  6. import json
  7. from jps2sm.myloginsession import jpopsuki, sugoimusic
  8. from jps2sm.constants import Categories
  9. from jps2sm.utils import remove_html_tags
  10. # Third-party packages
  11. from bs4 import BeautifulSoup
  12. logger = logging.getLogger('main.' + __name__)
  13. class GetGroupData:
  14. """
  15. Retrieve group data of the group supplied from args.parsed.urls
  16. Group data is defined as data that is constant for every release, eg category, artist, title, groupdescription, tags etc.
  17. Each property is gathered by calling a method of the class
  18. """
  19. def __init__(self, jpsurl):
  20. self.jpsurl = jpsurl
  21. logger.debug(f'Processing JPS URL: {jpsurl}')
  22. self.groupid: int = int()
  23. self.category: str = str()
  24. self.artist: str = str()
  25. self.date: str = str()
  26. self.title: str = str()
  27. self.originalartist: str = str()
  28. self.originaltitle: str = str()
  29. self.rel2: str = str()
  30. self.groupdescription: str = str()
  31. self.imagelink: str = str()
  32. self.tagsall: str = str()
  33. self.contribartists: str = str()
  34. self.getdata()
  35. def getdata(self):
  36. date_regex = r'[12]\d{3}\.(?:0[1-9]|1[0-2])\.(?:0[1-9]|[12]\d|3[01])' # YYYY.MM.DD format
  37. # YYYY.MM.DD OR YYYY format, for Pictures only
  38. date_regex2 = r'(?:[12]\d{3}\.(?:0[1-9]|1[0-2])\.(?:0[1-9]|[12]\d|3[01])|(?:19|20)\d\d)'
  39. res = jpopsuki(self.jpsurl.split()[0]) # If there are multiple urls only the first url needs to be parsed
  40. self.groupid = re.findall(r"(?!id=)\d+", self.jpsurl)[0]
  41. soup = BeautifulSoup(res.text, 'html5lib')
  42. artistlinelink = soup.select('.thin h2 a')
  43. originaltitleline = soup.select('.thin h3')
  44. logger.debug(torrent_description_page_h2_line := str(soup.select('.thin h2')[0]))
  45. self.category = re.findall(r'\[(.*?)\]', torrent_description_page_h2_line)[0]
  46. logger.info(f'Category: {self.category}')
  47. try:
  48. artist_raw = re.findall(r'<a[^>]+>(.*)<', str(artistlinelink[0]))[0]
  49. self.artist = split_bad_multiple_artists(artist_raw)
  50. except IndexError: # Cannot find artist
  51. if self.category == "Pictures":
  52. # JPS allows Picture torrents to have no artist set, in this scenario try to infer the artist by examining the text
  53. # immediately after the category string up to a YYYY.MM.DD string if available as this should be the magazine title
  54. try:
  55. self.artist = re.findall(fr'\[Pictures\] ([A-Za-z\. ]+) (?:{date_regex2})', torrent_description_page_h2_line)
  56. except IndexError:
  57. logger.exception('Cannot find artist')
  58. raise
  59. elif self.category == "Misc":
  60. # JPS has some older groups with no artists set, uploaders still used the "Artist - Group name" syntax though
  61. try:
  62. artist_raw = re.findall(r'\[Misc\] ([A-Za-z\, ]+) - ', torrent_description_page_h2_line)[0]
  63. except IndexError:
  64. logger.exception('Cannot find artist')
  65. raise
  66. self.artist = split_bad_multiple_artists(artist_raw)
  67. else:
  68. logger.exception('JPS upload appears to have no artist set and artist cannot be autodetected')
  69. raise
  70. logger.info(f'Artist(s): {self.artist}')
  71. # Extract date without using '[]' as it allows '[]' elsewhere in the title and it works with JPS TV-* categories
  72. try:
  73. self.date = re.findall(date_regex, torrent_description_page_h2_line)[0].replace(".", "")
  74. except IndexError: # Handle YYYY dates, creating extra regex as I cannot get it working without causing issue #33
  75. try:
  76. self.date = re.findall(r'[^\d]((?:19|20)\d{2})[^\d]', torrent_description_page_h2_line)[0]
  77. # Handle if cannot find date in the title, use upload date instead from getreleasedata() but error if the category should have it
  78. except IndexError:
  79. if self.category not in Categories.NonDate:
  80. logger.exception(f'Group release date not found and not using upload date instead as {self.category} torrents should have it set')
  81. else:
  82. logger.warning('Date not found from group data, will use upload date as the release date')
  83. self.date = None
  84. pass
  85. logger.info(f'Release date: {self.date}')
  86. if self.category not in Categories.NonDate:
  87. self.title = re.findall(r'<a.*> - (.*) \[', torrent_description_page_h2_line)[0]
  88. else:
  89. # Using two sets of findall() as I cannot get the OR regex operator "|" to work
  90. title1 = re.findall(r'<a.*> - (?:[12]\d{3}\.(?:0[1-9]|1[0-2])\.(?:0[1-9]|[12]\d|3[01])) - (.*)</h2>', torrent_description_page_h2_line)
  91. title2 = re.findall(r'<a.*> - (.*) \((.*) (?:[12]\d{3}\.(?:0[1-9]|1[0-2])\.(?:0[1-9]|[12]\d|3[01]))', torrent_description_page_h2_line)
  92. # title1 has 1 matching group, title2 has 2
  93. titlemergedpre = [title1, " ".join(itertools.chain(*title2))]
  94. titlemerged = "".join(itertools.chain(*titlemergedpre))
  95. if len(titlemerged) == 0: # Non standard title, fallback on the whole string after the "-"
  96. try:
  97. self.title = re.findall(r'<a.*> - (.*)</h2>', torrent_description_page_h2_line)[0]
  98. except IndexError:
  99. if self.category == "Pictures": # Pictures non-artist upload - for magazines
  100. # Fallback to all the text after the category, we need to include the date stamp as magazines are often titled
  101. # with the same numbers each year - the first magazine each year appears to always be 'No. 1' for example
  102. try:
  103. self.title = re.findall(fr'\[Pictures\] (?:[A-Za-z\. ]+) ({date_regex2}(?:.*))</h2>', torrent_description_page_h2_line)[0]
  104. except IndexError:
  105. logger.exception('Cannot find title from the JPS upload')
  106. raise
  107. elif self.category == "Misc":
  108. try:
  109. self.title = re.findall(r'\[Misc\] (?:[A-Za-z\, ]+) - (.+)</h2>', torrent_description_page_h2_line)[0]
  110. except IndexError:
  111. logger.exception('Cannot find title from the JPS upload')
  112. raise
  113. else:
  114. logger.exception('Cannot find title from the JPS upload')
  115. raise
  116. else:
  117. self.title = titlemerged
  118. logger.info(f'Title: {self.title}')
  119. try:
  120. originalchars = re.findall(r'<a href="artist.php\?id=(?:[0-9]+)">(.+)</a> - (.+)\)</h3>', str(originaltitleline))[0]
  121. self.originalartist = originalchars[0]
  122. self.originaltitle = originalchars[1]
  123. logger.info(f"Original artist: {self.originalartist} Original title: {self.originaltitle}")
  124. except IndexError: # Do nothing if group has no original artist/title
  125. pass
  126. self.rel2 = str(soup.select('#content .thin .main_column .torrent_table tbody')[0])
  127. # Get description with BB Code if user has group edit permissions on JPS, if not just use stripped html text.
  128. try:
  129. self.groupdescription = get_group_descrption_bbcode(self.groupid) # Requires PU+ at JPS
  130. except:
  131. logger.exception('Could not get group description BBCode. Are you a Power User+ at JPS?')
  132. self.groupdescription = remove_html_tags(str(soup.select('#content .thin .main_column .box .body')[0]))
  133. logger.info(f"Group description:\n{self.groupdescription}")
  134. image = str(soup.select('#content .thin .sidebar .box p a'))
  135. try:
  136. self.imagelink = "https://jpopsuki.eu/" + re.findall('<a\s+(?:[^>]*?\s+)?href=\"([^\"]*)\"', image)[0]
  137. logger.info(f'Image link: {self.imagelink}')
  138. except IndexError: # No image for the group
  139. self.imagelink = None
  140. tagsget = str(soup.select('#content .thin .sidebar .box ul.stats.nobullet li'))
  141. tags = re.findall('searchtags=([^\"]+)', tagsget)
  142. logger.info(f'Tags: {tags}')
  143. self.tagsall = ",".join(tags)
  144. try:
  145. contribartistsget = str(soup.select('#content .thin .sidebar .box .body ul.stats.nobullet li'))
  146. contribartistslist = re.findall(r'<li><a href="artist\.php\?id=(?:[0-9]+?)" title="([^"]*?)">([\w .-]+)</a>', contribartistsget)
  147. self.contribartists = {}
  148. for artistpair in contribartistslist:
  149. self.contribartists[artistpair[1]] = artistpair[0] # Creates contribartists[artist] = origartist
  150. logger.info(f'Contributing artists: {self.contribartists}')
  151. except IndexError: # Do nothing if group has no contrib artists
  152. pass
  153. def originalchars(self):
  154. return self.originalartist, self.originaltitle
  155. def __getattr__(self, item):
  156. return self.item
  157. def split_bad_multiple_artists(artists):
  158. return re.split(', | x | & ', artists)
  159. def get_release_data(torrentids, release_data, date):
  160. """
  161. Retrieve all torrent id and release data (slash separated data and upload date) whilst coping with 'noise' from FL torrents,
  162. and either return all data if using a group URL or only return the relevant releases if release url(s) were used
  163. :param torrentids: list of torrentids to be processed, NULL if group is used
  164. :return: releasedata: 2d dict of release data in the format of torrentid: { "slashdata" : [ slashdatalist ] , "uploaddate": uploaddate } .
  165. """
  166. freeleechtext = '<strong>Freeleech!</strong>'
  167. releasedatapre = re.findall(r"swapTorrent\('([0-9]+)'\);\">» (.*?)</a>.*?<blockquote>(?:\s*)Uploaded by <a href=\"user.php\?id=(?:[0-9]+)\">(?:[\S]+)</a> on <span title=\"(?:[^\"]+)\">([^<]+)</span>", release_data, re.DOTALL)
  168. # if args.parsed.debug:
  169. # print(f'Pre-processed releasedata: {json.dumps(releasedatapre, indent=2)}')
  170. releasedata = {}
  171. for release in releasedatapre:
  172. torrentid = release[0]
  173. slashlist = ([i.split(' / ') for i in [release[1]]])[0]
  174. uploadeddate = release[2]
  175. releasedata[torrentid] = {}
  176. releasedata[torrentid]['slashdata'] = slashlist
  177. releasedata[torrentid]['uploaddate'] = uploadeddate
  178. logger.debug(f'Entire group contains: {json.dumps(releasedata, indent=2)}')
  179. removetorrents = []
  180. for torrentid, release in releasedata.items(): # Now release is a dict!
  181. if len(torrentids) != 0 and torrentid not in torrentids:
  182. # If len(torrentids) != 0 then user has supplied a group url and every release is processed,
  183. # otherwise iterate through releasedata{} and remove what is not needed
  184. removetorrents.append(torrentid)
  185. if freeleechtext in release['slashdata']:
  186. release['slashdata'].remove(freeleechtext) # Remove Freeleech whole match so it does not interfere with Remastered
  187. for index, slashreleaseitem in enumerate(release['slashdata']):
  188. if remaster_freeleech_removed := re.findall(r'(.*) - <strong>Freeleech!<\/strong>', slashreleaseitem): # Handle Freeleech remastered torrents, issue #43
  189. release['slashdata'][index] = f'{remaster_freeleech_removed[0]} - {date[:4]}' # Use the extracted value and append group JPS release year
  190. logger.debug(f"Torrent {torrentid} is freeleech remastered, validated remasterdata to {release['slashdata'][index]}")
  191. for torrentid in removetorrents:
  192. del (releasedata[torrentid])
  193. logger.info(f'Selected for upload: {releasedata}')
  194. return releasedata
  195. def get_group_descrption_bbcode(groupid):
  196. """
  197. Retrieve original bbcode from edit group url and reformat any JPS style bbcode
  198. :param: groupid: JPS groupid to get group description with bbcode
  199. :return: bbcode: group description with bbcode
  200. """
  201. edit_group_page = jpopsuki(f"https://jpopsuki.eu/torrents.php?action=editgroup&groupid={groupid}")
  202. soup = BeautifulSoup(edit_group_page.text, 'html5lib')
  203. bbcode = soup.find("textarea", {"name": "body"}).string
  204. bbcode_sanitised = re.sub(r'\[youtube=([^\]]+)]', r'[youtube]\1[/youtube]', bbcode)
  205. return bbcode_sanitised
  206. def get_jps_user_id():
  207. """
  208. Returns the JPopSuki user id
  209. :return: int: user id
  210. """
  211. res = jpopsuki("https://jpopsuki.eu/", True)
  212. soup = BeautifulSoup(res.text, 'html5lib')
  213. href = soup.select('.username')[0]['href']
  214. jps_user_id = re.match(r"user\.php\?id=(\d+)", href).group(1)
  215. time.sleep(5) # Sleep as otherwise we hit JPS browse quota
  216. return int(str(jps_user_id))
  217. def get_user_keys():
  218. """
  219. Get SM session authkey and torrent_password_key for use by uploadtorrent()|download_sm_torrent() data dict.
  220. Uses SM login data
  221. """
  222. smpage = sugoimusic("https://sugoimusic.me/torrents.php?id=118", test_login=True) # Arbitrary page on JPS that has authkey
  223. soup = BeautifulSoup(smpage.text, 'html5lib')
  224. rel2 = str(soup.select_one('#torrent_details .group_torrent > td > span > .tooltip'))
  225. return {
  226. 'authkey': re.findall('authkey=(.*)&amp;torrent_pass=', rel2)[0],
  227. 'torrent_password_key': re.findall(r"torrent_pass=(.+)\" title", rel2)[0]
  228. }
  229. def get_torrent_link(torrentid, release_data):
  230. """
  231. Extract a torrent link for a given torrentid
  232. :param torrentid:
  233. :return: torrentlink: URI of torrent link
  234. """
  235. torrentlink = re.findall(rf'torrents\.php\?action=download&amp;id={torrentid}&amp;authkey=(?:[^&]+)&amp;torrent_pass=(?:[^"]+)', release_data)[0]
  236. return torrentlink