swingmusic/app/utils/parsers.py
geoffrey45 3d5ee855d8 add methods to extract base album title from an album title string
+ add methods to extract album versions
+ implement these methods in the album model class
2023-04-22 15:12:59 +03:00

135 lines
3.6 KiB
Python

import re
from enum import Enum
def split_artists(src: str, with_and: bool = False) -> list[str]:
    """
    Splits a string of artist names into a list of individual names.

    Names are always split on commas and semicolons. When ``with_and`` is
    true, an ampersand and the word "and" (with a space on each side) act
    as separators too.

    Surrounding whitespace is stripped from every name, and empty names
    (produced by blank input or by leading, trailing or doubled separators)
    are dropped, so the result never contains an empty string.
    """
    exp = r"\s*(?: and |&|,|;)\s*" if with_and else r"\s*[,;]\s*"
    stripped = (name.strip() for name in re.split(exp, src))

    # re.split yields "" around stray separators; those are not artists
    return [name for name in stripped if name]
def parse_artist_from_filename(title: str):
    """
    Extracts artist names from a song title using regex.

    The title is expected to look like "<artists> - <rest>", where the dash
    may be a hyphen, an en dash or an em dash. The text captured before the
    dash is handed to ``split_artists``. An empty list is returned when the
    title does not match that pattern.
    """
    found = re.search(r"^(.+?)\s*[-–—]\s*(?:.+?)$", title, re.IGNORECASE)
    if found is None:
        return []

    return split_artists(found.group(1))
def parse_title_from_filename(title: str):
    """
    Extracts track title from a song title using regex.

    The title is expected to look like "<artists> - <track>", where the dash
    may be a hyphen, an en dash or an em dash. When it does not match that
    pattern, the input is returned unchanged. Otherwise the text captured
    after the dash is returned, stripped of surrounding whitespace and of
    parenthesised groups that mention "official".
    """
    found = re.search(r"^(?:.+?)\s*[-–—]\s*(.+?)$", title, re.IGNORECASE)
    if found is None:
        return title

    # drop every "(...)" group containing "official" (any case), along with
    # the whitespace in front of it, e.g. " (Official Video)"
    cleaned = re.sub(
        r"\s*\([^)]*official[^)]*\)", "", found.group(1), flags=re.IGNORECASE
    )
    return cleaned.strip()
def remove_prod(title: str) -> str:
    """
    Removes the producer string in a track title using regex.

    Two forms are handled:

    - a bracketed block that starts with "prod.", such as "(prod. X)" or
      "[Prod. by X]", which is removed whole;
    - a bare "prod. X" marker, where "prod." and the single
      whitespace-delimited token after it are removed.

    The bare form is only tried when no bracketed block was found. Titles
    without "prod." (case-insensitive) are returned untouched; otherwise
    the result is stripped of surrounding whitespace.
    """
    # nothing to do when the title has no "prod." marker at all
    if "prod." not in title.lower():
        return title

    # Bracketed form first. The block and the whitespace around it collapse
    # into one space so the words on either side are not glued together.
    title, removed = re.subn(
        r"\s*[(\[]prod\..*?[)\]]\s*", " ", title, flags=re.IGNORECASE
    )

    if not removed:
        # Bare form. The token stops at a closing bracket so that a marker
        # sitting inside an unrelated bracket leaves that bracket balanced.
        title = re.sub(
            r"\s?\bprod\.\s*[^\s)\]]+", "", title, flags=re.IGNORECASE
        )

    return title.strip()
def parse_feat_from_title(title: str) -> tuple[list[str], str]:
    """
    Extracts featured artists from a song title using regex.

    Looks for a group introduced by "feat", "ft", "featuring" or "with"
    (optionally followed by a dot), first in round brackets and then in
    square brackets. Returns the featured artists together with the title
    with every group of the matched bracket style removed. The returned
    title is not stripped. When no group is found, the result is an empty
    list and the title as given.
    """
    patterns = (
        r"\((?:feat|ft|featuring|with)\.?\s+(.+?)\)",  # round brackets
        r"\[(?:feat|ft|featuring|with)\.?\s+(.+?)\]",  # square brackets
    )

    for pattern in patterns:
        found = re.search(pattern, title, re.IGNORECASE)
        if found is None:
            continue

        # "&" and "and" also separate names inside a feat group
        featured = split_artists(found.group(1), with_and=True)

        # remove "feat" group from title
        remaining = re.sub(pattern, "", title, flags=re.IGNORECASE)
        return featured, remaining

    return [], title
def get_base_album_title(string):
    """
    Splits an album title into its base title and its version block.

    The version block is the first span that runs from an opening round or
    square bracket to a closing one and contains "version", "remaster",
    "deluxe", "edition" or "expanded" (case-insensitive). Any whitespace
    directly in front of the opening bracket is part of the block.

    Returns a ``(title, block)`` tuple. ``title`` is the input with every
    occurrence of the block removed and with spaces and bracket characters
    stripped from both ends. When no block is found, the input is returned
    unchanged and ``block`` is ``None``.
    """
    found = re.search(
        r'\s*(\(|\[).*?(version|remaster|deluxe|edition|expanded).*?(\)|\])',
        string,
        re.IGNORECASE,
    )
    if found is None:
        return string, None

    version_block = found.group(0)
    base_title = string.replace(version_block, '')
    return base_title.strip('()[] '), version_block
class AlbumVersionEnum(Enum):
    """
    Album version labels and the keywords that identify each of them.

    The value of every member is a tuple of keyword strings.
    """

    REMASTER = ("remaster", "remastered")
    DELUXE = ("deluxe",)
    EXPANDED = ("expanded",)
    SUPER_DELUXE = ("super deluxe",)
    EXTENDED = ("extended",)
    BONUS_TRACK = ("bonus track", "bonus tracks")
    RE_RECORD = ("re-recorded", "rerecorded")
    INTL_VERSION = ("international",)
    ORIGINAL = ("original",)
    RE_MIX = ("re-mix",)
    # Shares RE_RECORD's value, so Enum treats this name as an alias of
    # RE_RECORD: it resolves to the same member and is skipped when
    # iterating over the class.
    RE_RECORDED = RE_RECORD
    REISSUE = ("reissue",)
def get_album_info(bracket_text: str | None) -> list[str]:
    """
    Lists the album versions mentioned in a bracketed version block.

    Every ``AlbumVersionEnum`` member is checked in definition order. Its
    lowercased name is included once if any of its keywords is found in
    ``bracket_text``, ignoring case. Returns an empty list when
    ``bracket_text`` is empty or ``None``.
    """
    if not bracket_text:
        return []

    return [
        version.name.lower()
        for version in AlbumVersionEnum
        # one hit per member is enough; any() stops at the first keyword found
        if any(
            re.search(keyword, bracket_text, re.IGNORECASE)
            for keyword in version.value
        )
    ]
def get_base_title_and_versions(album: str) -> tuple[str, list[str]]:
    """
    Returns the base title of an album and the versions found in its title.

    The title is split with ``get_base_album_title`` and the version block
    it returns (possibly ``None``) is passed to ``get_album_info``.
    """
    base_title, bracket_text = get_base_album_title(album)
    return base_title, get_album_info(bracket_text)