增加商品名称标题匹配工具类(性能与计算均衡版),还可以继续优化。

This commit is contained in:
Jack 2025-07-09 23:55:25 +08:00
parent 1c9e6644d4
commit 943eb54037

View File

@ -13,6 +13,7 @@ import org.apache.commons.text.similarity.LevenshteinDistance;
import java.util.*; import java.util.*;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Slf4j @Slf4j
public class ProductTitleUtil { public class ProductTitleUtil {
@ -42,6 +43,8 @@ public class ProductTitleUtil {
private static final Pattern NUMERIC_PATTERN = Pattern.compile(".*\\d+.*"); private static final Pattern NUMERIC_PATTERN = Pattern.compile(".*\\d+.*");
private static final Pattern UNIT_PATTERN = Pattern.compile("(\\d+\\.?\\d*)([a-zA-Z]+)"); private static final Pattern UNIT_PATTERN = Pattern.compile("(\\d+\\.?\\d*)([a-zA-Z]+)");
private static final Pattern TITLE_FILTER_PATTERN = Pattern.compile("[^a-zA-Z0-9\u4e00-\u9fa5]"); private static final Pattern TITLE_FILTER_PATTERN = Pattern.compile("[^a-zA-Z0-9\u4e00-\u9fa5]");
private static final Pattern SYMBOL_PATTERN = Pattern.compile("[\\p{P}\\p{S}\\s]+"); // runs of punctuation, symbols and whitespace
/** /**
* 品牌词库初始化后不可变 * 品牌词库初始化后不可变
*/ */
@ -115,6 +118,36 @@ public class ProductTitleUtil {
} }
} }
/**
 * Cleans an e-commerce product title down to its core product name.
 *
 * <p>Processing order: every match of {@code SYMBOL_PATTERN} is deleted, each entry of
 * {@code STOP_WORDS} is removed (longest entries first), and the remainder is passed
 * through {@code normalizeUnit}.
 *
 * @param title the raw product title; may be {@code null}
 * @return the cleaned title, or an empty string when {@code title} is {@code null} or empty
 */
public static String cleanTitle2(String title) {
    if (title == null || title.isEmpty()) {
        return "";
    }

    // Longest stop words are removed first so that a shorter word cannot cut a longer
    // one apart and leave fragments behind.
    List<String> wordsLongestFirst = STOP_WORDS.stream()
            .sorted(Comparator.comparingInt(String::length).reversed())
            .collect(Collectors.toList());

    // Drop all symbols and whitespace before matching stop words.
    String result = SYMBOL_PATTERN.matcher(title).replaceAll("");
    for (String stopWord : wordsLongestFirst) {
        result = result.replace(stopWord, "");
    }

    // Spec/unit normalization is delegated to normalizeUnit (defined elsewhere in this class).
    return normalizeUnit(result);
}
/* ---------------------------- 私有方法 ---------------------------- */ /* ---------------------------- 私有方法 ---------------------------- */
/** /**
@ -245,5 +278,19 @@ public class ProductTitleUtil {
System.out.println("标题1与标题4相似度" + calculateSimilarity(title1, title4)); // 约100.0 System.out.println("标题1与标题4相似度" + calculateSimilarity(title1, title4)); // 约100.0
System.out.println("标题5与标题6相似度" + calculateSimilarity(title5, title6)); // 输出约90 System.out.println("标题5与标题6相似度" + calculateSimilarity(title5, title6)); // 输出约90
System.out.println("标题7与标题8相似度" + calculateSimilarity(title7, title8)); // 输出约45 System.out.println("标题7与标题8相似度" + calculateSimilarity(title7, title8)); // 输出约45
String[] testTitles = {
"【限时秒杀】三只松鼠开心果100g特价促销",
"华为Mate60 Pro 512G手机 官方旗舰店正品",
"Nike Air Max 运动鞋 男款 42码 新品热卖"
};
for (String title : testTitles) {
System.out.println("原始标题: " + title);
System.out.println("清洗后: " + cleanTitle(title));
System.out.println("-------------------");
}
} }
} }