diff --git a/mall-common/src/main/java/com/suisung/mall/common/utils/ProductTitleUtil.java b/mall-common/src/main/java/com/suisung/mall/common/utils/ProductTitleUtil.java
index c5006271..d9ff780f 100644
--- a/mall-common/src/main/java/com/suisung/mall/common/utils/ProductTitleUtil.java
+++ b/mall-common/src/main/java/com/suisung/mall/common/utils/ProductTitleUtil.java
@@ -13,6 +13,7 @@ import org.apache.commons.text.similarity.LevenshteinDistance;
 
 import java.util.*;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 
 @Slf4j
 public class ProductTitleUtil {
@@ -42,6 +43,8 @@ public class ProductTitleUtil {
     private static final Pattern NUMERIC_PATTERN = Pattern.compile(".*\\d+.*");
     private static final Pattern UNIT_PATTERN = Pattern.compile("(\\d+\\.?\\d*)([a-zA-Z]+)");
     private static final Pattern TITLE_FILTER_PATTERN = Pattern.compile("[^a-zA-Z0-9\u4e00-\u9fa5]");
+    private static final Pattern SYMBOL_PATTERN = Pattern.compile("[\\p{P}\\p{S}\\s]+"); // punctuation, symbols and whitespace
+
     /**
      * 品牌词库(初始化后不可变)
      */
@@ -115,6 +118,41 @@ public class ProductTitleUtil {
         }
     }
 
+    /**
+     * Cleans an e-commerce product title down to its core product name.
+     *
+     * <p>Steps, in order: strip every run of punctuation, symbols and whitespace
+     * (see {@code SYMBOL_PATTERN}), delete each entry of {@code STOP_WORDS} (longest
+     * entries first), then pass the result through {@code normalizeUnit}.
+     *
+     * @param title raw product title; may be {@code null}
+     * @return the cleaned title, or an empty string when {@code title} is {@code null} or empty
+     */
+    public static String cleanTitle2(String title) {
+        if (title == null || title.isEmpty()) {
+            return "";
+        }
+
+        // 1. Strip all punctuation, symbols and whitespace.
+        String cleaned = SYMBOL_PATTERN.matcher(title).replaceAll("");
+
+        // 2. Order the stop words (marketing terms) longest-first so a short word cannot
+        //    remove part of a longer one and leave a fragment behind.
+        List<String> sortedMarketingWords = STOP_WORDS.stream()
+                .sorted(Comparator.comparingInt(String::length).reversed())
+                .collect(Collectors.toList());
+
+        // 3. Remove every stop word from the title.
+        for (String word : sortedMarketingWords) {
+            cleaned = cleaned.replace(word, "");
+        }
+
+        // 4. Normalize specification units (e.g. "500g" -> "500克").
+        cleaned = normalizeUnit(cleaned);
+
+        return cleaned;
+    }
+
     /* ---------------------------- 私有方法 ---------------------------- */
 
     /**
@@ -245,5 +283,19 @@ public class ProductTitleUtil {
         System.out.println("标题1与标题4相似度:" + calculateSimilarity(title1, title4)); // 约100.0
         System.out.println("标题5与标题6相似度:" + calculateSimilarity(title5, title6)); // 输出约90
         System.out.println("标题7与标题8相似度:" + calculateSimilarity(title7, title8)); // 输出约45
+
+
+        String[] testTitles = {
+                "【限时秒杀】三只松鼠开心果100g特价促销",
+                "华为Mate60 Pro 512G手机 官方旗舰店正品",
+                "Nike Air Max 运动鞋 男款 42码 新品热卖"
+        };
+
+        for (String title : testTitles) {
+            System.out.println("原始标题: " + title);
+            System.out.println("清洗后: " + cleanTitle(title)); // NOTE(review): calls cleanTitle, not the newly added cleanTitle2 - confirm intent
+            System.out.println("-------------------");
+        }
+
     }
 }