@@ -10,9 +10,13 @@ import cn.iocoder.yudao.module.ai.service.websearch.vo.WebSearchRespVO;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;

import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;

/**
* Bing Web search implementation class
@@ -136,4 +139,79 @@ public class WebSearchServiceImpl implements WebSearchService {
return CollUtil.newArrayList();
}
}
+
+    /**
+     * Web crawler
+     *
+     * @param urls URLs to crawl
+     * @return key: URL; value: crawled content
+     */
+    @Override
+    public Map<String, String> webCrawler(List<String> urls) {
+        if (CollUtil.isEmpty(urls)) {
+            return Map.of();
+        }
+
+        Map<String, String> result = new HashMap<>();
+        for (String url : urls) {
+            try {
+                // Parse the URL to obtain its origin for the Origin header
+                String origin = extractOrigin(url);
+
+                // Send an HTTP request to fetch the page content
+                HttpResponse response = HttpRequest.get(url)
+                        .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+                        .header("Origin", origin)
+                        .header("Referer", origin)
+                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
+                        .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
+                        .header("Cache-Control", "max-age=0")
+                        .timeout(10000) // 10-second timeout
+                        .execute();
+
+                if (response.isOk()) {
+                    String html = response.body();
+
+                    // Parse the HTML with Jsoup to extract the text content
+                    Document doc = Jsoup.parse(html);
+
+                    // Remove script, style, meta, and link elements; their content is not the text we want
+                    doc.select("script, style, meta, link").remove();
+
+                    // Get the text content of the body
+                    String text = doc.body().text();
+
+                    // Clean up the text (collapse redundant whitespace)
+                    text = text.replaceAll("\\s+", " ").trim();
+
+                    result.put(url, text);
+                } else {
+                    log.warn("[webCrawler][URL({}) request failed, status code: {}]", url, response.getStatus());
+                    result.put(url, "");
+                }
+            } catch (Exception e) {
+                log.error("[webCrawler][URL({}) crawl error]", url, e);
+                result.put(url, "");
+            }
+        }
+
+        return result;
+    }
+
+    /**
+     * Extract the Origin from a URL
+     *
+     * @param url full URL
+     * @return Origin (scheme://host[:port])
+     */
+    private String extractOrigin(String url) {
+        try {
+            java.net.URL parsedUrl = new java.net.URL(url);
+            return parsedUrl.getProtocol() + "://" + parsedUrl.getHost() +
+                    (parsedUrl.getPort() == -1 ? "" : ":" + parsedUrl.getPort());
+        } catch (Exception e) {
+            log.warn("[extractOrigin][URL({}) parse error]", url, e);
+            return "";
+        }
+    }
}
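
For reference, a minimal caller sketch of the new webCrawler API. The component class, bean wiring, and URLs below are illustrative assumptions, not part of this change; it only relies on WebSearchService declaring webCrawler(List<String>), which the @Override above implies, and the service's package is inferred from the vo import in the first hunk.

// Hypothetical usage sketch; class name, wiring, and URLs are illustrative.
import cn.iocoder.yudao.module.ai.service.websearch.WebSearchService; // package assumed from the vo import above

import jakarta.annotation.Resource; // jakarta assumed (Spring Boot 3); use javax.annotation on Boot 2
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;

import java.util.List;
import java.util.Map;

@Component
@Slf4j
public class WebCrawlerUsageExample {

    @Resource
    private WebSearchService webSearchService;

    public void printPageTexts() {
        List<String> urls = List.of("https://example.com", "https://example.org");
        // key: URL, value: extracted page text ("" when the fetch failed or threw)
        Map<String, String> texts = webSearchService.webCrawler(urls);
        texts.forEach((url, text) ->
                log.info("[printPageTexts][{} -> {} chars]", url, text.length()));
    }
}

Note the design choice in the implementation: failed URLs map to "" rather than being dropped, so the result stays keyed by every requested URL; callers that must distinguish failures from genuinely empty pages need to treat empty values accordingly.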