|
|
@@ -6,7 +6,7 @@ import cn.hutool.http.HttpResponse;
|
|
|
import cn.hutool.json.JSONArray;
|
|
|
import cn.hutool.json.JSONObject;
|
|
|
import cn.hutool.json.JSONUtil;
|
|
|
-import cn.iocoder.yudao.module.ai.service.websearch.vo.WebSearchRespVO;
|
|
|
+import cn.iocoder.yudao.module.ai.service.websearch.vo.AiWebSearchRespVO;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import org.springframework.beans.factory.annotation.Value;
|
|
|
import org.springframework.stereotype.Service;
|
|
|
@@ -46,11 +46,11 @@ public class WebSearchServiceImpl implements WebSearchService {
|
|
|
* @return 搜索结果列表
|
|
|
*/
|
|
|
@Override
|
|
|
- public List<WebSearchRespVO> bingSearch(String query, Integer count) {
|
|
|
+ public List<AiWebSearchRespVO> bingSearch(String query, Integer count) {
|
|
|
if (query == null || query.isEmpty()) {
|
|
|
return CollUtil.newArrayList();
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
try {
|
|
|
// 发送请求
|
|
|
HttpResponse response = HttpRequest.get(BING_URL)
|
|
|
@@ -60,41 +60,41 @@ public class WebSearchServiceImpl implements WebSearchService {
|
|
|
.form("responseFilter", "Webpages")
|
|
|
.form("textFormat", "Raw")
|
|
|
.execute();
|
|
|
-
|
|
|
+
|
|
|
// 解析响应
|
|
|
String body = response.body();
|
|
|
JSONObject json = JSONUtil.parseObj(body);
|
|
|
-
|
|
|
+
|
|
|
// 处理结果
|
|
|
- List<WebSearchRespVO> results = new ArrayList<>();
|
|
|
+ List<AiWebSearchRespVO> results = new ArrayList<>();
|
|
|
if (json.containsKey("webPages") && json.getJSONObject("webPages").containsKey("value")) {
|
|
|
JSONArray items = json.getJSONObject("webPages").getJSONArray("value");
|
|
|
for (int i = 0; i < items.size(); i++) {
|
|
|
JSONObject item = items.getJSONObject(i);
|
|
|
- WebSearchRespVO result = new WebSearchRespVO()
|
|
|
+ AiWebSearchRespVO result = new AiWebSearchRespVO()
|
|
|
.setTitle(item.getStr("name"))
|
|
|
.setUrl(item.getStr("url"))
|
|
|
.setSnippet(item.getStr("snippet"));
|
|
|
results.add(result);
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
return results;
|
|
|
} catch (Exception e) {
|
|
|
log.error("[bingSearch][查询({}) 发生异常]", query, e);
|
|
|
return CollUtil.newArrayList();
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/**
|
|
|
- * Google 搜索(使用Serper API)
|
|
|
+ * Google 搜索(使用 Serper API)
|
|
|
*
|
|
|
* @param query 搜索关键词
|
|
|
* @param count 返回结果数量
|
|
|
* @return 搜索结果列表
|
|
|
*/
|
|
|
@Override
|
|
|
- public List<WebSearchRespVO> googleSearch(String query, Integer count) {
|
|
|
+ public List<AiWebSearchRespVO> googleSearch(String query, Integer count) {
|
|
|
if (query == null || query.isEmpty()) {
|
|
|
return CollUtil.newArrayList();
|
|
|
}
|
|
|
@@ -105,24 +105,24 @@ public class WebSearchServiceImpl implements WebSearchService {
|
|
|
payload.set("q", query);
|
|
|
payload.set("gl", "cn");
|
|
|
payload.set("num", count);
|
|
|
-
|
|
|
+
|
|
|
// 发送请求
|
|
|
HttpResponse response = HttpRequest.post(GOOGLE_URL)
|
|
|
.header("X-API-KEY", googleApiKey)
|
|
|
.header("Content-Type", "application/json")
|
|
|
.body(payload.toString())
|
|
|
.execute();
|
|
|
-
|
|
|
+
|
|
|
// 解析响应
|
|
|
String body = response.body();
|
|
|
JSONObject json = JSONUtil.parseObj(body);
|
|
|
JSONArray organicResults = json.getJSONArray("organic");
|
|
|
-
|
|
|
+
|
|
|
// 处理结果
|
|
|
- List<WebSearchRespVO> results = new ArrayList<>();
|
|
|
+ List<AiWebSearchRespVO> results = new ArrayList<>();
|
|
|
for (int i = 0; i < organicResults.size(); i++) {
|
|
|
JSONObject item = organicResults.getJSONObject(i);
|
|
|
- WebSearchRespVO result = new WebSearchRespVO()
|
|
|
+ AiWebSearchRespVO result = new AiWebSearchRespVO()
|
|
|
.setTitle(item.getStr("title"))
|
|
|
.setUrl(item.getStr("link"))
|
|
|
.setSnippet(item.containsKey("snippet") ? item.getStr("snippet") : "");
|
|
|
@@ -146,13 +146,13 @@ public class WebSearchServiceImpl implements WebSearchService {
|
|
|
if (CollUtil.isEmpty(urls)) {
|
|
|
return Map.of();
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
Map<String, String> result = new HashMap<>();
|
|
|
for (String url : urls) {
|
|
|
try {
|
|
|
// 解析URL以获取域名作为Origin
|
|
|
String origin = extractOrigin(url);
|
|
|
-
|
|
|
+
|
|
|
// 发送HTTP请求获取网页内容
|
|
|
HttpResponse response = HttpRequest.get(url)
|
|
|
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
|
|
@@ -163,22 +163,22 @@ public class WebSearchServiceImpl implements WebSearchService {
|
|
|
.header("Cache-Control", "max-age=0")
|
|
|
.timeout(10000) // 设置10秒超时
|
|
|
.execute();
|
|
|
-
|
|
|
+
|
|
|
if (response.isOk()) {
|
|
|
String html = response.body();
|
|
|
-
|
|
|
+
|
|
|
// 使用Jsoup解析HTML并提取文本内容
|
|
|
org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(html);
|
|
|
-
|
|
|
+
|
|
|
// 移除script和style元素,它们包含的内容不是我们需要的文本
|
|
|
doc.select("script, style, meta, link").remove();
|
|
|
-
|
|
|
+
|
|
|
// 获取body中的文本内容
|
|
|
String text = doc.body().text();
|
|
|
-
|
|
|
+
|
|
|
// 清理文本(移除多余空格)
|
|
|
text = text.replaceAll("\\s+", " ").trim();
|
|
|
-
|
|
|
+
|
|
|
result.put(url, text);
|
|
|
} else {
|
|
|
log.warn("[webCrawler][URL({}) 请求失败,状态码: {}]", url, response.getStatus());
|
|
|
@@ -189,20 +189,20 @@ public class WebSearchServiceImpl implements WebSearchService {
|
|
|
result.put(url, "");
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
return result;
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
/**
|
|
|
* 从URL中提取Origin
|
|
|
- *
|
|
|
+ *
|
|
|
* @param url 完整URL
|
|
|
* @return Origin (scheme://host[:port])
|
|
|
*/
|
|
|
private String extractOrigin(String url) {
|
|
|
try {
|
|
|
java.net.URL parsedUrl = new java.net.URL(url);
|
|
|
- return parsedUrl.getProtocol() + "://" + parsedUrl.getHost() +
|
|
|
+ return parsedUrl.getProtocol() + "://" + parsedUrl.getHost() +
|
|
|
(parsedUrl.getPort() == -1 ? "" : ":" + parsedUrl.getPort());
|
|
|
} catch (Exception e) {
|
|
|
log.warn("[extractOrigin][URL({}) 解析异常]", url, e);
|