swingmusic/app/utils/parsers.py
mungai-njoroge 9972f64e8c configure to allow updating artist separators
+ remove ampersand from default
+ misc
2023-08-25 20:09:50 +03:00

214 lines
5.6 KiB
Python

import re
from app.enums.album_versions import AlbumVersionEnum
from app.settings import get_flag, ParserFlags
def split_artists(src: str):
"""
Splits a string of artists into a list of artists.
"""
separators: set = get_flag(ParserFlags.ARTIST_SEPARATORS)
separators = separators.union({","})
for sep in separators:
src = src.replace(sep, "߸")
artists = src.split("߸")
return [a.strip() for a in artists]
def parse_artist_from_filename(title: str):
"""
Extracts artist names from a song title using regex.
"""
regex = r"^(.+?)\s*[-–—]\s*(?:.+?)$"
match = re.search(regex, title, re.IGNORECASE)
if not match:
return []
artists = match.group(1)
artists = split_artists(artists)
return artists
def parse_title_from_filename(title: str):
"""
Extracts track title from a song title using regex.
"""
regex = r"^(?:.+?)\s*[-–—]\s*(.+?)$"
match = re.search(regex, title, re.IGNORECASE)
if not match:
return title
res = match.group(1)
# remove text in brackets starting with "official" case-insensitive
res = re.sub(r"\s*\([^)]*official[^)]*\)", "", res, flags=re.IGNORECASE)
return res.strip()
def remove_prod(title: str) -> str:
"""
Removes the producer string in a track title using regex.
"""
# check if title contain title, if not return it.
if not ("prod." in title.lower()):
return title
# check if title has brackets
if re.search(r"[()\[\]]", title):
regex = r"\s?(\(|\[)prod\..*?(\)|\])\s?"
else:
regex = r"\s?\bprod\.\s*\S+"
# remove the producer string
title = re.sub(regex, "", title, flags=re.IGNORECASE)
return title.strip()
def parse_feat_from_title(title: str) -> tuple[list[str], str]:
"""
Extracts featured artists from a song title using regex.
"""
regex = r"\((?:feat|ft|featuring|with)\.?\s+(.+?)\)"
# regex for square brackets 👇
sqr_regex = r"\[(?:feat|ft|featuring|with)\.?\s+(.+?)\]"
match = re.search(regex, title, re.IGNORECASE)
if not match:
match = re.search(sqr_regex, title, re.IGNORECASE)
regex = sqr_regex
if not match:
return [], title
artists = match.group(1)
artists = split_artists(artists)
# remove "feat" group from title
new_title = re.sub(regex, "", title, flags=re.IGNORECASE)
return artists, new_title
def get_base_album_title(string) -> tuple[str, str | None]:
"""
Extracts the base album title from a string.
"""
pattern = re.compile(
r"\s*(\(|\[)[^\)\]]*?(version|remaster|deluxe|edition|expanded|anniversary)[^\)\]]*?(\)|\])$",
re.IGNORECASE,
)
# TODO: Fix "Redundant character escape '\]' in RegExp "
match = pattern.search(string)
if match:
removed_block = match.group(0)
title = string.replace(removed_block, "")
return title.strip(), removed_block.strip()
return string, None
def get_anniversary(text: str) -> str | None:
"""
Extracts anniversary from text using regex.
"""
_end = "anniversary"
match = re.search(r"\b\d+\w*(?= anniversary)", text, re.IGNORECASE)
if match:
return match.group(0).strip().lower() + f" {_end}"
else:
return _end
def get_album_info(bracket_text: str | None) -> list[str]:
"""
Extracts album version info from the bracketed text on an album title string using regex.
"""
if not bracket_text:
return []
# replace all non-alphanumeric characters with an empty string
bracket_text = re.sub(r"[^a-zA-Z0-9\s]", "", bracket_text)
versions = []
for version_keywords in AlbumVersionEnum:
for keyword in version_keywords.value:
if re.search(keyword, bracket_text, re.IGNORECASE):
versions.append(version_keywords.name.lower())
break
if "anniversary" in versions:
anniversary = get_anniversary(bracket_text)
versions.insert(0, anniversary)
versions.remove("anniversary")
return versions
def get_base_title_and_versions(
original_album_title: str, get_versions=True
) -> tuple[str, list[str]]:
"""
Extracts the base album title and version info from an album title string using regex.
"""
album_title, version_block = get_base_album_title(original_album_title)
if version_block is None:
return original_album_title, []
if not get_versions:
return album_title, []
versions = get_album_info(version_block)
# if no version info could be extracted, accept defeat!
if len(versions) == 0:
album_title = original_album_title
return album_title, versions
def remove_bracketed_remaster(text: str):
"""
Removes remaster info from a track title that contains brackets using regex.
"""
return re.sub(
r"\s*[\\[(][^)\]]*remaster[^)\]]*[)\]]\s*", "", text, flags=re.IGNORECASE
).strip()
def remove_hyphen_remasters(text: str):
"""
Removes remaster info from a track title that contains a hypen (-) using regex.
"""
return re.sub(
r"\s-\s*[^-]*\bremaster[^-]*\s*", "", text, flags=re.IGNORECASE
).strip()
def clean_title(title: str) -> str:
"""
Removes remaster info from a track title using regex.
"""
if "remaster" not in title.lower():
return title
rem_1 = remove_bracketed_remaster(title)
rem_2 = remove_hyphen_remasters(title)
return rem_1 if len(rem_2) > len(rem_1) else rem_2
# if "[" in title or "(" in title:
# return remove_bracketed_remaster(title)
#
# if "-" in title:
# return remove_hyphen_remasters(title)