增加 商品名称标题匹配工具类,性能和计算均衡版,还可以继续优化。
This commit is contained in:
parent
1c9e6644d4
commit
943eb54037
@ -13,6 +13,7 @@ import org.apache.commons.text.similarity.LevenshteinDistance;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
public class ProductTitleUtil {
|
||||
@ -42,6 +43,8 @@ public class ProductTitleUtil {
|
||||
private static final Pattern NUMERIC_PATTERN = Pattern.compile(".*\\d+.*");
|
||||
private static final Pattern UNIT_PATTERN = Pattern.compile("(\\d+\\.?\\d*)([a-zA-Z]+)");
|
||||
private static final Pattern TITLE_FILTER_PATTERN = Pattern.compile("[^a-zA-Z0-9\u4e00-\u9fa5]");
|
||||
private static final Pattern SYMBOL_PATTERN = Pattern.compile("[\\p{P}\\p{S}\\s]+"); // runs of punctuation, symbol, and whitespace characters
|
||||
|
||||
/**
|
||||
* 品牌词库(初始化后不可变)
|
||||
*/
|
||||
@ -115,6 +118,36 @@ public class ProductTitleUtil {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 清洗电商商品标题,提炼核心商品名
|
||||
*
|
||||
* @param title 原始商品标题
|
||||
* @return 清洗后的核心商品名
|
||||
*/
|
||||
public static String cleanTitle2(String title) {
|
||||
if (title == null || title.isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// 1. 去除所有符号和空格
|
||||
String cleaned = SYMBOL_PATTERN.matcher(title).replaceAll("");
|
||||
|
||||
// 2. 按字符长度降序排列营销词,优先替换长词(避免部分匹配)
|
||||
List<String> sortedMarketingWords = STOP_WORDS.stream()
|
||||
.sorted((a, b) -> b.length() - a.length())
|
||||
.collect(Collectors.toList());
|
||||
|
||||
// 3. 移除营销词(按长度降序处理,避免短词先匹配导致长词残留)
|
||||
for (String word : sortedMarketingWords) {
|
||||
cleaned = cleaned.replace(word, "");
|
||||
}
|
||||
|
||||
// 4. 规格单位归一化(如 "500g" → "500克")
|
||||
cleaned = normalizeUnit(cleaned);
|
||||
|
||||
return cleaned;
|
||||
}
|
||||
|
||||
/* ---------------------------- 私有方法 ---------------------------- */
|
||||
|
||||
/**
|
||||
@ -245,5 +278,19 @@ public class ProductTitleUtil {
|
||||
System.out.println("标题1与标题4相似度:" + calculateSimilarity(title1, title4)); // 约100.0
|
||||
System.out.println("标题5与标题6相似度:" + calculateSimilarity(title5, title6)); // 输出约90
|
||||
System.out.println("标题7与标题8相似度:" + calculateSimilarity(title7, title8)); // 输出约45
|
||||
|
||||
|
||||
String[] testTitles = {
|
||||
"【限时秒杀】三只松鼠开心果100g特价促销",
|
||||
"华为Mate60 Pro 512G手机 官方旗舰店正品",
|
||||
"Nike Air Max 运动鞋 男款 42码 新品热卖"
|
||||
};
|
||||
|
||||
for (String title : testTitles) {
|
||||
System.out.println("原始标题: " + title);
|
||||
System.out.println("清洗后: " + cleanTitle(title));
|
||||
System.out.println("-------------------");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user