Reduce silent-query noise in training and open-dataset preparation
Constraint: Real music queries often include long silence heads/tails, but the pipeline still needs random-crop generalization and simple CLI controls
Rejected: Replace all random crops with structure-aware segmentation | would overfit to curated boundaries and diverge from messy real-world query distributions
Confidence: high
Scope-risk: moderate
Directive: Keep random as fallback; layer beat/onset/chorus-aware segmentation on top instead of removing silence-aware and sliding paths
Tested: /usr/local/miniconda3/bin/python -m py_compile acr-engine/src/data/dataset.py acr-engine/src/data/manifest_tools.py acr-engine/train.py acr-engine/src/data/external_adapters.py; external_adapters.py prepare-local fma /tmp/segtest_audio --query-strategy silence_aware; train.py --data data/synthetic_v2 --dry-run --segment-strategy hybrid
Not-tested: Full FMA smoke retraining/eval with the new segmentation strategies
Showing 6 changed files with 261 additions and 7 deletions
| ... | @@ -23,6 +23,8 @@ class ACRDataset(Dataset): | ... | @@ -23,6 +23,8 @@ class ACRDataset(Dataset): |
| 23 | n_crops_per_song: int = 4, | 23 | n_crops_per_song: int = 4, |
| 24 | song_to_idx: Optional[Dict[str, int]] = None, | 24 | song_to_idx: Optional[Dict[str, int]] = None, |
| 25 | references_only: bool = False, | 25 | references_only: bool = False, |
| 26 | segment_strategy: str = "random", | ||
| 27 | silence_top_db: int = 30, | ||
| 26 | ): | 28 | ): |
| 27 | self.sr = sr | 29 | self.sr = sr |
| 28 | self.n_mels = n_mels | 30 | self.n_mels = n_mels |
| ... | @@ -31,6 +33,8 @@ class ACRDataset(Dataset): | ... | @@ -31,6 +33,8 @@ class ACRDataset(Dataset): |
| 31 | self.segment_len = int(segment_dur * sr) | 33 | self.segment_len = int(segment_dur * sr) |
| 32 | self.augment = augment | 34 | self.augment = augment |
| 33 | self.n_crops = n_crops_per_song | 35 | self.n_crops = n_crops_per_song |
| 36 | self.segment_strategy = segment_strategy | ||
| 37 | self.silence_top_db = silence_top_db | ||
| 34 | self.data_dir = Path(data_dir) | 38 | self.data_dir = Path(data_dir) |
| 35 | self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir | 39 | self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir |
| 36 | 40 | ||
| ... | @@ -70,13 +74,52 @@ class ACRDataset(Dataset): | ... | @@ -70,13 +74,52 @@ class ACRDataset(Dataset): |
| 70 | ) | 74 | ) |
| 71 | return librosa.power_to_db(mel, ref=np.max) | 75 | return librosa.power_to_db(mel, ref=np.max) |
| 72 | 76 | ||
| 77 | def _find_non_silent_intervals(self, y: np.ndarray) -> List[tuple[int, int]]: | ||
| 78 | intervals = librosa.effects.split(y, top_db=self.silence_top_db) | ||
| 79 | if intervals is None or len(intervals) == 0: | ||
| 80 | return [(0, len(y))] | ||
| 81 | return [(int(s), int(e)) for s, e in intervals] | ||
| 82 | |||
| 83 | def _choose_offset(self, sample: Dict, audio_path: Path) -> float: | ||
| 84 | duration = float(sample["duration"]) | ||
| 85 | max_offset = max(0.0, duration - 5.0) | ||
| 86 | if max_offset <= 0: | ||
| 87 | return 0.0 | ||
| 88 | |||
| 89 | if self.segment_strategy == "random": | ||
| 90 | return random.uniform(0, max_offset) | ||
| 91 | |||
| 92 | y, _ = librosa.load(str(audio_path), sr=self.sr, mono=True) | ||
| 93 | target_len = self.segment_len | ||
| 94 | intervals = self._find_non_silent_intervals(y) | ||
| 95 | valid_intervals = [] | ||
| 96 | for start, end in intervals: | ||
| 97 | if end - start >= target_len: | ||
| 98 | valid_intervals.append((start, end)) | ||
| 99 | |||
| 100 | if self.segment_strategy == "silence_aware": | ||
| 101 | if valid_intervals: | ||
| 102 | start, end = random.choice(valid_intervals) | ||
| 103 | seg_max_start = max(start, end - target_len) | ||
| 104 | chosen = random.randint(start, seg_max_start) if seg_max_start > start else start | ||
| 105 | return min(chosen / self.sr, max_offset) | ||
| 106 | return random.uniform(0, max_offset) | ||
| 107 | |||
| 108 | if self.segment_strategy == "hybrid": | ||
| 109 | if valid_intervals and random.random() < 0.7: | ||
| 110 | start, end = random.choice(valid_intervals) | ||
| 111 | seg_max_start = max(start, end - target_len) | ||
| 112 | chosen = random.randint(start, seg_max_start) if seg_max_start > start else start | ||
| 113 | return min(chosen / self.sr, max_offset) | ||
| 114 | return random.uniform(0, max_offset) | ||
| 115 | |||
| 116 | return random.uniform(0, max_offset) | ||
| 117 | |||
| 73 | def __getitem__(self, idx): | 118 | def __getitem__(self, idx): |
| 74 | sample = self.samples[idx // self.n_crops] | 119 | sample = self.samples[idx // self.n_crops] |
| 75 | duration = sample["duration"] | ||
| 76 | max_offset = max(0, duration - 5.0) | ||
| 77 | offset = random.uniform(0, max_offset) if max_offset > 0 else 0 | ||
| 78 | 120 | ||
| 79 | audio_path = self.asset_root / sample["audio_path"] | 121 | audio_path = self.asset_root / sample["audio_path"] |
| 122 | offset = self._choose_offset(sample, audio_path) | ||
| 80 | y = self._load_segment(str(audio_path), offset, 5.0) | 123 | y = self._load_segment(str(audio_path), offset, 5.0) |
| 81 | 124 | ||
| 82 | if self.augment and sample.get("type") != "reference": | 125 | if self.augment and sample.get("type") != "reference": |
| ... | @@ -172,6 +215,8 @@ class SongPairDataset(Dataset): | ... | @@ -172,6 +215,8 @@ class SongPairDataset(Dataset): |
| 172 | hop_length: int = 160, | 215 | hop_length: int = 160, |
| 173 | segment_dur: float = 5.0, | 216 | segment_dur: float = 5.0, |
| 174 | augment: bool = True, | 217 | augment: bool = True, |
| 218 | segment_strategy: str = "random", | ||
| 219 | silence_top_db: int = 30, | ||
| 175 | ): | 220 | ): |
| 176 | self.sr = sr | 221 | self.sr = sr |
| 177 | self.n_mels = n_mels | 222 | self.n_mels = n_mels |
| ... | @@ -179,6 +224,8 @@ class SongPairDataset(Dataset): | ... | @@ -179,6 +224,8 @@ class SongPairDataset(Dataset): |
| 179 | self.hop_length = hop_length | 224 | self.hop_length = hop_length |
| 180 | self.segment_len = int(segment_dur * sr) | 225 | self.segment_len = int(segment_dur * sr) |
| 181 | self.augment = augment | 226 | self.augment = augment |
| 227 | self.segment_strategy = segment_strategy | ||
| 228 | self.silence_top_db = silence_top_db | ||
| 182 | self.data_dir = Path(data_dir) | 229 | self.data_dir = Path(data_dir) |
| 183 | self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir | 230 | self.asset_root = self.data_dir.parent if self.data_dir.name == "manifests" else self.data_dir |
| 184 | 231 | ||
| ... | @@ -211,11 +258,32 @@ class SongPairDataset(Dataset): | ... | @@ -211,11 +258,32 @@ class SongPairDataset(Dataset): |
| 211 | 258 | ||
| 212 | def _load_clip(self, sample: Dict) -> np.ndarray: | 259 | def _load_clip(self, sample: Dict) -> np.ndarray: |
| 213 | path = self.asset_root / sample["audio_path"] | 260 | path = self.asset_root / sample["audio_path"] |
| 214 | y, _ = librosa.load(str(path), sr=self.sr, mono=True, duration=5.0) | 261 | full_y, _ = librosa.load(str(path), sr=self.sr, mono=True) |
| 262 | duration = float(sample.get("duration", len(full_y) / self.sr)) | ||
| 263 | max_offset = max(0.0, duration - 5.0) | ||
| 264 | offset = 0.0 | ||
| 265 | if max_offset > 0: | ||
| 266 | if self.segment_strategy == "random": | ||
| 267 | offset = random.uniform(0, max_offset) | ||
| 268 | else: | ||
| 269 | intervals = librosa.effects.split(full_y, top_db=self.silence_top_db) | ||
| 270 | valid = [(int(s), int(e)) for s, e in intervals if int(e) - int(s) >= self.segment_len] if len(intervals) else [] | ||
| 271 | if self.segment_strategy == "silence_aware" and valid: | ||
| 272 | start, end = random.choice(valid) | ||
| 273 | seg_max_start = max(start, end - self.segment_len) | ||
| 274 | chosen = random.randint(start, seg_max_start) if seg_max_start > start else start | ||
| 275 | offset = min(chosen / self.sr, max_offset) | ||
| 276 | elif self.segment_strategy == "hybrid" and valid and random.random() < 0.7: | ||
| 277 | start, end = random.choice(valid) | ||
| 278 | seg_max_start = max(start, end - self.segment_len) | ||
| 279 | chosen = random.randint(start, seg_max_start) if seg_max_start > start else start | ||
| 280 | offset = min(chosen / self.sr, max_offset) | ||
| 281 | else: | ||
| 282 | offset = random.uniform(0, max_offset) | ||
| 283 | start = int(offset * self.sr) | ||
| 284 | y = full_y[start : start + self.segment_len] | ||
| 215 | if len(y) < self.segment_len: | 285 | if len(y) < self.segment_len: |
| 216 | y = np.pad(y, (0, self.segment_len - len(y))) | 286 | y = np.pad(y, (0, self.segment_len - len(y))) |
| 217 | else: | ||
| 218 | y = y[: self.segment_len] | ||
| 219 | return y | 287 | return y |
| 220 | 288 | ||
| 221 | def _to_mel(self, y: np.ndarray) -> torch.Tensor: | 289 | def _to_mel(self, y: np.ndarray) -> torch.Tensor: | ... | ... |
| ... | @@ -104,6 +104,8 @@ class BaseAdapter: | ... | @@ -104,6 +104,8 @@ class BaseAdapter: |
| 104 | eval_ratio: float = 0.2, | 104 | eval_ratio: float = 0.2, |
| 105 | query_duration: float = 8.0, | 105 | query_duration: float = 8.0, |
| 106 | query_stride: float | None = None, | 106 | query_stride: float | None = None, |
| 107 | query_strategy: str = "random", | ||
| 108 | silence_top_db: int = 30, | ||
| 107 | seed: int = 42, | 109 | seed: int = 42, |
| 108 | ) -> Dict: | 110 | ) -> Dict: |
| 109 | output_root.mkdir(parents=True, exist_ok=True) | 111 | output_root.mkdir(parents=True, exist_ok=True) |
| ... | @@ -126,6 +128,12 @@ class BaseAdapter: | ... | @@ -126,6 +128,12 @@ class BaseAdapter: |
| 126 | str(query_stride), | 128 | str(query_stride), |
| 127 | ]) | 129 | ]) |
| 128 | cmd.extend([ | 130 | cmd.extend([ |
| 131 | "--query-strategy", | ||
| 132 | str(query_strategy), | ||
| 133 | "--silence-top-db", | ||
| 134 | str(silence_top_db), | ||
| 135 | ]) | ||
| 136 | cmd.extend([ | ||
| 129 | "--seed", | 137 | "--seed", |
| 130 | str(seed), | 138 | str(seed), |
| 131 | ]) | 139 | ]) |
| ... | @@ -361,6 +369,9 @@ def smoke_local_dataset( | ... | @@ -361,6 +369,9 @@ def smoke_local_dataset( |
| 361 | eval_ratio: float, | 369 | eval_ratio: float, |
| 362 | query_duration: float, | 370 | query_duration: float, |
| 363 | query_stride: float | None, | 371 | query_stride: float | None, |
| 372 | query_strategy: str, | ||
| 373 | segment_strategy: str, | ||
| 374 | silence_top_db: int, | ||
| 364 | seed: int, | 375 | seed: int, |
| 365 | train_epochs: int, | 376 | train_epochs: int, |
| 366 | batch_size: int, | 377 | batch_size: int, |
| ... | @@ -388,6 +399,8 @@ def smoke_local_dataset( | ... | @@ -388,6 +399,8 @@ def smoke_local_dataset( |
| 388 | eval_ratio=eval_ratio, | 399 | eval_ratio=eval_ratio, |
| 389 | query_duration=query_duration, | 400 | query_duration=query_duration, |
| 390 | query_stride=query_stride, | 401 | query_stride=query_stride, |
| 402 | query_strategy=query_strategy, | ||
| 403 | silence_top_db=silence_top_db, | ||
| 391 | seed=seed, | 404 | seed=seed, |
| 392 | ) | 405 | ) |
| 393 | manifests_dir = Path(prepare_summary["output_dir"]) | 406 | manifests_dir = Path(prepare_summary["output_dir"]) |
| ... | @@ -407,6 +420,8 @@ def smoke_local_dataset( | ... | @@ -407,6 +420,8 @@ def smoke_local_dataset( |
| 407 | "--device", resolved_device, | 420 | "--device", resolved_device, |
| 408 | "--epochs", str(train_epochs), | 421 | "--epochs", str(train_epochs), |
| 409 | "--batch-size", str(batch_size), | 422 | "--batch-size", str(batch_size), |
| 423 | "--segment-strategy", str(segment_strategy), | ||
| 424 | "--silence-top-db", str(silence_top_db), | ||
| 410 | ], check=True) | 425 | ], check=True) |
| 411 | 426 | ||
| 412 | subprocess.run([ | 427 | subprocess.run([ |
| ... | @@ -444,6 +459,9 @@ def smoke_local_dataset( | ... | @@ -444,6 +459,9 @@ def smoke_local_dataset( |
| 444 | base_cfg=base_cfg, | 459 | base_cfg=base_cfg, |
| 445 | ) | 460 | ) |
| 446 | config["data"]["manifest_query_stride"] = query_stride | 461 | config["data"]["manifest_query_stride"] = query_stride |
| 462 | config["data"]["manifest_query_strategy"] = query_strategy | ||
| 463 | config["data"]["silence_top_db"] = silence_top_db | ||
| 464 | config["run"]["train_segment_strategy"] = segment_strategy | ||
| 447 | report_dir.mkdir(parents=True, exist_ok=True) | 465 | report_dir.mkdir(parents=True, exist_ok=True) |
| 448 | config_path.write_text(json.dumps(config, indent=2)) | 466 | config_path.write_text(json.dumps(config, indent=2)) |
| 449 | 467 | ||
| ... | @@ -493,6 +511,8 @@ def main(): | ... | @@ -493,6 +511,8 @@ def main(): |
| 493 | p.add_argument("--eval-ratio", type=float, default=0.2) | 511 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 494 | p.add_argument("--query-duration", type=float, default=8.0) | 512 | p.add_argument("--query-duration", type=float, default=8.0) |
| 495 | p.add_argument("--query-stride", type=float, default=None) | 513 | p.add_argument("--query-stride", type=float, default=None) |
| 514 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") | ||
| 515 | p.add_argument("--silence-top-db", type=int, default=30) | ||
| 496 | p.add_argument("--seed", type=int, default=42) | 516 | p.add_argument("--seed", type=int, default=42) |
| 497 | 517 | ||
| 498 | p = sub.add_parser("inspect-local") | 518 | p = sub.add_parser("inspect-local") |
| ... | @@ -523,6 +543,9 @@ def main(): | ... | @@ -523,6 +543,9 @@ def main(): |
| 523 | p.add_argument("--eval-ratio", type=float, default=0.2) | 543 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 524 | p.add_argument("--query-duration", type=float, default=8.0) | 544 | p.add_argument("--query-duration", type=float, default=8.0) |
| 525 | p.add_argument("--query-stride", type=float, default=None) | 545 | p.add_argument("--query-stride", type=float, default=None) |
| 546 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") | ||
| 547 | p.add_argument("--segment-strategy", choices=["random", "silence_aware", "hybrid"], default="random") | ||
| 548 | p.add_argument("--silence-top-db", type=int, default=30) | ||
| 526 | p.add_argument("--seed", type=int, default=42) | 549 | p.add_argument("--seed", type=int, default=42) |
| 527 | p.add_argument("--train-epochs", type=int, default=1) | 550 | p.add_argument("--train-epochs", type=int, default=1) |
| 528 | p.add_argument("--batch-size", type=int, default=2) | 551 | p.add_argument("--batch-size", type=int, default=2) |
| ... | @@ -545,6 +568,8 @@ def main(): | ... | @@ -545,6 +568,8 @@ def main(): |
| 545 | eval_ratio=args.eval_ratio, | 568 | eval_ratio=args.eval_ratio, |
| 546 | query_duration=args.query_duration, | 569 | query_duration=args.query_duration, |
| 547 | query_stride=args.query_stride, | 570 | query_stride=args.query_stride, |
| 571 | query_strategy=args.query_strategy, | ||
| 572 | silence_top_db=args.silence_top_db, | ||
| 548 | seed=args.seed, | 573 | seed=args.seed, |
| 549 | ) | 574 | ) |
| 550 | print(json.dumps(summary, indent=2, ensure_ascii=False)) | 575 | print(json.dumps(summary, indent=2, ensure_ascii=False)) |
| ... | @@ -577,6 +602,9 @@ def main(): | ... | @@ -577,6 +602,9 @@ def main(): |
| 577 | eval_ratio=args.eval_ratio, | 602 | eval_ratio=args.eval_ratio, |
| 578 | query_duration=args.query_duration, | 603 | query_duration=args.query_duration, |
| 579 | query_stride=args.query_stride, | 604 | query_stride=args.query_stride, |
| 605 | query_strategy=args.query_strategy, | ||
| 606 | segment_strategy=args.segment_strategy, | ||
| 607 | silence_top_db=args.silence_top_db, | ||
| 580 | seed=args.seed, | 608 | seed=args.seed, |
| 581 | train_epochs=args.train_epochs, | 609 | train_epochs=args.train_epochs, |
| 582 | batch_size=args.batch_size, | 610 | batch_size=args.batch_size, | ... | ... |
| ... | @@ -11,6 +11,7 @@ from pathlib import Path | ... | @@ -11,6 +11,7 @@ from pathlib import Path |
| 11 | from typing import List, Dict | 11 | from typing import List, Dict |
| 12 | import numpy as np | 12 | import numpy as np |
| 13 | import soundfile as sf | 13 | import soundfile as sf |
| 14 | import librosa | ||
| 14 | 15 | ||
| 15 | 16 | ||
| 16 | def write_catalog(records: List[Dict], output_path: Path): | 17 | def write_catalog(records: List[Dict], output_path: Path): |
| ... | @@ -45,6 +46,8 @@ def build_train_eval_from_audio_dir( | ... | @@ -45,6 +46,8 @@ def build_train_eval_from_audio_dir( |
| 45 | eval_ratio: float = 0.2, | 46 | eval_ratio: float = 0.2, |
| 46 | query_duration: float = 8.0, | 47 | query_duration: float = 8.0, |
| 47 | query_stride: float | None = None, | 48 | query_stride: float | None = None, |
| 49 | query_strategy: str = "random", | ||
| 50 | silence_top_db: int = 30, | ||
| 48 | seed: int = 42, | 51 | seed: int = 42, |
| 49 | ): | 52 | ): |
| 50 | rng = random.Random(seed) | 53 | rng = random.Random(seed) |
| ... | @@ -59,6 +62,38 @@ def build_train_eval_from_audio_dir( | ... | @@ -59,6 +62,38 @@ def build_train_eval_from_audio_dir( |
| 59 | train = [] | 62 | train = [] |
| 60 | test = [] | 63 | test = [] |
| 61 | 64 | ||
| 65 | def compute_silence_aware_offsets(path: Path, duration: float) -> List[float]: | ||
| 66 | if duration < query_duration: | ||
| 67 | return [] | ||
| 68 | try: | ||
| 69 | y, sr = librosa.load(str(path), sr=None, mono=True) | ||
| 70 | intervals = librosa.effects.split(y, top_db=silence_top_db) | ||
| 71 | if intervals is None or len(intervals) == 0: | ||
| 72 | raise ValueError("no_non_silent_intervals") | ||
| 73 | offsets = [] | ||
| 74 | target_len = int(query_duration * sr) | ||
| 75 | for start, end in intervals: | ||
| 76 | start = int(start) | ||
| 77 | end = int(end) | ||
| 78 | if end - start < target_len: | ||
| 79 | continue | ||
| 80 | if query_stride and query_stride > 0: | ||
| 81 | stride = int(query_stride * sr) | ||
| 82 | local_positions = list(range(start, max(start + 1, end - target_len + 1), stride)) | ||
| 83 | if not local_positions: | ||
| 84 | local_positions = [start] | ||
| 85 | last_pos = end - target_len | ||
| 86 | if last_pos >= start and local_positions[-1] != last_pos: | ||
| 87 | local_positions.append(last_pos) | ||
| 88 | offsets.extend([round(pos / sr, 3) for pos in local_positions]) | ||
| 89 | else: | ||
| 90 | seg_max_start = max(start, end - target_len) | ||
| 91 | chosen = rng.randint(start, seg_max_start) if seg_max_start > start else start | ||
| 92 | offsets.append(round(chosen / sr, 3)) | ||
| 93 | return sorted(set(x for x in offsets if x <= max(0.0, duration - query_duration))) | ||
| 94 | except Exception: | ||
| 95 | return [] | ||
| 96 | |||
| 62 | for idx, path in enumerate(files): | 97 | for idx, path in enumerate(files): |
| 63 | target_name = f"{source_dataset}_{idx:05d}{path.suffix.lower()}" | 98 | target_name = f"{source_dataset}_{idx:05d}{path.suffix.lower()}" |
| 64 | target_path = audio_out_dir / target_name | 99 | target_path = audio_out_dir / target_name |
| ... | @@ -82,7 +117,21 @@ def build_train_eval_from_audio_dir( | ... | @@ -82,7 +117,21 @@ def build_train_eval_from_audio_dir( |
| 82 | refs.append(ref) | 117 | refs.append(ref) |
| 83 | 118 | ||
| 84 | if duration >= query_duration: | 119 | if duration >= query_duration: |
| 85 | if query_stride and query_stride > 0: | 120 | if query_strategy in {"silence_aware", "hybrid"}: |
| 121 | silence_offsets = compute_silence_aware_offsets(path, duration) | ||
| 122 | else: | ||
| 123 | silence_offsets = [] | ||
| 124 | |||
| 125 | if query_strategy == "silence_aware" and silence_offsets: | ||
| 126 | offsets = silence_offsets | ||
| 127 | elif query_strategy == "hybrid" and silence_offsets: | ||
| 128 | if query_stride and query_stride > 0: | ||
| 129 | offsets = silence_offsets | ||
| 130 | else: | ||
| 131 | max_offset = max(0.0, duration - query_duration) | ||
| 132 | random_offset = round(rng.uniform(0.0, max_offset) if max_offset > 0 else 0.0, 3) | ||
| 133 | offsets = sorted(set(silence_offsets + [random_offset])) | ||
| 134 | elif query_stride and query_stride > 0: | ||
| 86 | max_offset = max(0.0, duration - query_duration) | 135 | max_offset = max(0.0, duration - query_duration) |
| 87 | offsets = [round(x, 3) for x in np.arange(0.0, max_offset + 1e-9, query_stride).tolist()] | 136 | offsets = [round(x, 3) for x in np.arange(0.0, max_offset + 1e-9, query_stride).tolist()] |
| 88 | if not offsets: | 137 | if not offsets: |
| ... | @@ -124,6 +173,7 @@ def build_train_eval_from_audio_dir( | ... | @@ -124,6 +173,7 @@ def build_train_eval_from_audio_dir( |
| 124 | "test_queries": len(test), | 173 | "test_queries": len(test), |
| 125 | "query_duration": query_duration, | 174 | "query_duration": query_duration, |
| 126 | "query_stride": query_stride, | 175 | "query_stride": query_stride, |
| 176 | "query_strategy": query_strategy, | ||
| 127 | "output_dir": str(manifests_dir), | 177 | "output_dir": str(manifests_dir), |
| 128 | } | 178 | } |
| 129 | 179 | ||
| ... | @@ -225,6 +275,8 @@ def main(): | ... | @@ -225,6 +275,8 @@ def main(): |
| 225 | p.add_argument("--eval-ratio", type=float, default=0.2) | 275 | p.add_argument("--eval-ratio", type=float, default=0.2) |
| 226 | p.add_argument("--query-duration", type=float, default=8.0) | 276 | p.add_argument("--query-duration", type=float, default=8.0) |
| 227 | p.add_argument("--query-stride", type=float, default=None) | 277 | p.add_argument("--query-stride", type=float, default=None) |
| 278 | p.add_argument("--query-strategy", choices=["random", "sliding", "silence_aware", "hybrid"], default="random") | ||
| 279 | p.add_argument("--silence-top-db", type=int, default=30) | ||
| 228 | p.add_argument("--seed", type=int, default=42) | 280 | p.add_argument("--seed", type=int, default=42) |
| 229 | 281 | ||
| 230 | p = sub.add_parser("inspect-audio-dir") | 282 | p = sub.add_parser("inspect-audio-dir") |
| ... | @@ -247,6 +299,8 @@ def main(): | ... | @@ -247,6 +299,8 @@ def main(): |
| 247 | eval_ratio=args.eval_ratio, | 299 | eval_ratio=args.eval_ratio, |
| 248 | query_duration=args.query_duration, | 300 | query_duration=args.query_duration, |
| 249 | query_stride=args.query_stride, | 301 | query_stride=args.query_stride, |
| 302 | query_strategy=args.query_strategy, | ||
| 303 | silence_top_db=args.silence_top_db, | ||
| 250 | seed=args.seed, | 304 | seed=args.seed, |
| 251 | ) | 305 | ) |
| 252 | print(json.dumps({"status": "ok", **summary}, ensure_ascii=False)) | 306 | print(json.dumps({"status": "ok", **summary}, ensure_ascii=False)) | ... | ... |
| ... | @@ -125,6 +125,8 @@ def main(): | ... | @@ -125,6 +125,8 @@ def main(): |
| 125 | parser.add_argument("--epochs", type=int, default=None) | 125 | parser.add_argument("--epochs", type=int, default=None) |
| 126 | parser.add_argument("--batch-size", type=int, default=None) | 126 | parser.add_argument("--batch-size", type=int, default=None) |
| 127 | parser.add_argument("--lr", type=float, default=None) | 127 | parser.add_argument("--lr", type=float, default=None) |
| 128 | parser.add_argument("--segment-strategy", choices=["random", "silence_aware", "hybrid"], default="random") | ||
| 129 | parser.add_argument("--silence-top-db", type=int, default=30) | ||
| 128 | parser.add_argument("--dry-run", action="store_true") | 130 | parser.add_argument("--dry-run", action="store_true") |
| 129 | args = parser.parse_args() | 131 | args = parser.parse_args() |
| 130 | 132 | ||
| ... | @@ -153,6 +155,8 @@ def main(): | ... | @@ -153,6 +155,8 @@ def main(): |
| 153 | hop_length=cfg["data"]["hop_length"], | 155 | hop_length=cfg["data"]["hop_length"], |
| 154 | segment_dur=cfg["data"]["segment_dur"], | 156 | segment_dur=cfg["data"]["segment_dur"], |
| 155 | augment=True, | 157 | augment=True, |
| 158 | segment_strategy=args.segment_strategy, | ||
| 159 | silence_top_db=args.silence_top_db, | ||
| 156 | ) | 160 | ) |
| 157 | 161 | ||
| 158 | catalog_dataset = ACRDataset( | 162 | catalog_dataset = ACRDataset( |
| ... | @@ -166,6 +170,8 @@ def main(): | ... | @@ -166,6 +170,8 @@ def main(): |
| 166 | augment=False, | 170 | augment=False, |
| 167 | n_crops_per_song=1, | 171 | n_crops_per_song=1, |
| 168 | song_to_idx=train_dataset.song_to_idx, | 172 | song_to_idx=train_dataset.song_to_idx, |
| 173 | segment_strategy=args.segment_strategy, | ||
| 174 | silence_top_db=args.silence_top_db, | ||
| 169 | ) | 175 | ) |
| 170 | 176 | ||
| 171 | train_loader = DataLoader( | 177 | train_loader = DataLoader( | ... | ... |
| ... | @@ -5398,3 +5398,46 @@ | ... | @@ -5398,3 +5398,46 @@ |
| 5398 | - **人工标 offset 的短视频片段**:保持单条 query | 5398 | - **人工标 offset 的短视频片段**:保持单条 query |
| 5399 | - **只有整首音频、没有 query 起点的素材**:自动生成多窗口 query | 5399 | - **只有整首音频、没有 query 起点的素材**:自动生成多窗口 query |
| 5400 | - 这让 `7/8/16/18` 这类 query 型素材可以更直接进入训练与评测流水线,同时保留对 `pgvector` 入库的可追踪性 | 5400 | - 这让 `7/8/16/18` 这类 query 型素材可以更直接进入训练与评测流水线,同时保留对 `pgvector` 入库的可追踪性 |
| 5401 | |||
| 5402 | ### Stage: silence-aware segmentation for training and open-dataset query generation | ||
| 5403 | |||
| 5404 | 完成项: | ||
| 5405 | - 在 `acr-engine/src/data/dataset.py` 为训练切片新增: | ||
| 5406 | - `segment_strategy=random|silence_aware|hybrid` | ||
| 5407 | - `silence_top_db` | ||
| 5408 | - 接入 `librosa.effects.split`,用于优先选择非静音区作为训练片段来源 | ||
| 5409 | - 在 `acr-engine/src/data/manifest_tools.py` 为外部数据 query 生成新增: | ||
| 5410 | - `--query-strategy random|sliding|silence_aware|hybrid` | ||
| 5411 | - `--silence-top-db` | ||
| 5412 | - 在 `acr-engine/train.py` 暴露训练 CLI 参数: | ||
| 5413 | - `--segment-strategy` | ||
| 5414 | - `--silence-top-db` | ||
| 5415 | - 在 `acr-engine/src/data/external_adapters.py` 接通 `prepare-local` / `smoke-local` 的策略透传与配置落盘 | ||
| 5416 | - 在 [docs/training-data-and-pgvector-guide.md](./training-data-and-pgvector-guide.md) 补充“切片策略”章节 | ||
| 5417 | |||
| 5418 | 验证结果: | ||
| 5419 | - 代码编译验证: | ||
| 5420 | - `/usr/local/miniconda3/bin/python -m py_compile src/data/dataset.py src/data/manifest_tools.py train.py src/data/external_adapters.py` | ||
| 5421 | - 人造音频验证: | ||
| 5422 | - 构造 `4s silence + 10s tone + 4s silence` | ||
| 5423 | - `manifest_tools.py --query-strategy silence_aware --query-duration 5 --query-stride 2.5` | ||
| 5424 | - 导出 query offset:`3.968, 8.968, 9.08` | ||
| 5425 | - 说明 query 已明显偏向非静音主体区 | ||
| 5426 | - 训练侧偏移验证: | ||
| 5427 | - `random` offset 样本:`0.325, 1.13, 2.902, 3.575, 8.313, 8.797, 9.574, 11.598` | ||
| 5428 | - `silence_aware` offset 样本:`4.173, 4.228, 4.736, 5.111, 5.874, 5.974, 8.436, 8.805` | ||
| 5429 | - 说明 silence-aware 显著减少落入头尾静音区的概率 | ||
| 5430 | - dry-run 验证: | ||
| 5431 | - `train.py --data data/synthetic_v2 --dry-run --segment-strategy silence_aware` | ||
| 5432 | - forward/backward 成功,`Embedding shape: torch.Size([64, 192])` | ||
| 5433 | - adapter 验证: | ||
| 5434 | - `external_adapters.py prepare-local ... --query-strategy silence_aware` | ||
| 5435 | - summary 已记录 `query_strategy: silence_aware` | ||
| 5436 | |||
| 5437 | 结论: | ||
| 5438 | - 当前项目不再只有“随机切” | ||
| 5439 | - 已形成: | ||
| 5440 | - **训练侧**:`random / silence_aware / hybrid` | ||
| 5441 | - **建库侧**:固定滑窗 | ||
| 5442 | - **开源集 query 生成侧**:`random / sliding / silence_aware / hybrid` | ||
| 5443 | - 下一阶段可继续叠加 beat/onset/chorus-aware 切片,而无需推翻现有流程 | ... | ... |
| ... | @@ -345,6 +345,61 @@ flowchart TD | ... | @@ -345,6 +345,61 @@ flowchart TD |
| 345 | 345 | ||
| 346 | --- | 346 | --- |
| 347 | 347 | ||
| 348 | ## 11.5 切片策略:不要只用随机切 | ||
| 349 | |||
| 350 | 当前项目现在已经支持 4 类切片思路,但职责不同: | ||
| 351 | |||
| 352 | | 策略 | 适用位置 | 作用 | 是否已接入 | | ||
| 353 | |---|---|---|---| | ||
| 354 | | `random` | 训练 query | 增强泛化,模拟未知用户截取点 | 是 | | ||
| 355 | | `sliding` | 建库 / query 生成 | 保证覆盖率,减少漏召回 | 是 | | ||
| 356 | | `silence_aware` | 训练 query / 外部 query 生成 | 优先避开静音,落到真正有音乐内容的片段 | 是 | | ||
| 357 | | `hybrid` | 训练 query / 外部 query 生成 | 混合 silence-aware + random,兼顾稳定性与泛化 | 是 | | ||
| 358 | |||
| 359 | 推荐理解: | ||
| 360 | |||
| 361 | 1. **训练不是全部随机切** | ||
| 362 | 当前训练集可用 `random / silence_aware / hybrid` | ||
| 363 | 2. **reference 建库不是随机切** | ||
| 364 | 建库仍然是固定滑窗 | ||
| 365 | 3. **外部数据 query 生成也不是只能随机切** | ||
| 366 | 现在可选 `--query-strategy silence_aware` | ||
| 367 | |||
| 368 | 为什么不直接完全依赖音乐结构分段? | ||
| 369 | |||
| 370 | - ACR 真实 query 往往来自短视频、录屏、随手截取,不一定对齐节拍或段落边界 | ||
| 371 | - 先做 **静音感知分段**,收益最大、风险最小 | ||
| 372 | - 更复杂的 beat / chorus / onset 分段可以作为下一阶段增强,而不应替代现有随机增强 | ||
| 373 | |||
| 374 | ### 训练侧推荐 | ||
| 375 | |||
| 376 | ```bash | ||
| 377 | /usr/local/miniconda3/bin/python acr-engine/train.py \ | ||
| 378 | --data data/your_manifests \ | ||
| 379 | --segment-strategy hybrid \ | ||
| 380 | --silence-top-db 30 | ||
| 381 | ``` | ||
| 382 | |||
| 383 | 建议: | ||
| 384 | - baseline:`random` | ||
| 385 | - 更稳的音乐任务:`hybrid` | ||
| 386 | - 已知原始音频静音很多:`silence_aware` | ||
| 387 | |||
| 388 | ### 外部数据 query 生成推荐 | ||
| 389 | |||
| 390 | ```bash | ||
| 391 | /usr/local/miniconda3/bin/python acr-engine/src/data/external_adapters.py prepare-local fma data/raw/fma_small_audio \ | ||
| 392 | --output-root data/external_ingested \ | ||
| 393 | --query-duration 8 \ | ||
| 394 | --query-stride 4 \ | ||
| 395 | --query-strategy silence_aware \ | ||
| 396 | --silence-top-db 30 | ||
| 397 | ``` | ||
| 398 | |||
| 399 | 这会优先从非静音区生成 query,而不是从长静音头尾随机采样。 | ||
| 400 | |||
| 401 | --- | ||
| 402 | |||
| 348 | ## 12. 你这批内部素材 type,哪些推荐参与训练 | 403 | ## 12. 你这批内部素材 type,哪些推荐参与训练 |
| 349 | 404 | ||
| 350 | ## 12.1 一页结论 | 405 | ## 12.1 一页结论 | ... | ... |
-
Please register or sign in to post a comment