Browse Source

[Added] Web search: crawler fetches web page content

cherishsince 8 months ago
parent
commit
f638f90afc

+ 9 - 0
yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/websearch/WebSearchService.java

@@ -3,6 +3,7 @@ package cn.iocoder.yudao.module.ai.service.websearch;
 import cn.iocoder.yudao.module.ai.service.websearch.vo.WebSearchRespVO;
 
 import java.util.List;
+import java.util.Map;
 
 /**
 * Web search Service interface
@@ -26,4 +27,12 @@ public interface WebSearchService {
     * @return list of search results
      */
     List<WebSearchRespVO> googleSearch(String query, Integer count);
+
+    /**
+     * Web crawler.
+     *
+     * @param urls URLs to crawl
+     * @return map whose key is the URL and whose value is the crawled page text
+     */
+    Map<String, String> webCrawler(List<String> urls);
 }
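
For context, a minimal usage sketch of the new interface method (the caller class, bean wiring, and sample URL are illustrative assumptions, not part of this commit):

    import cn.iocoder.yudao.module.ai.service.websearch.WebSearchService;
    import jakarta.annotation.Resource;
    import org.springframework.stereotype.Component;

    import java.util.List;
    import java.util.Map;

    // Hypothetical caller (not in the commit); assumes the Jakarta annotations
    // used by recent Spring Boot versions and an injectable WebSearchService bean.
    @Component
    public class WebCrawlerCaller {

        @Resource
        private WebSearchService webSearchService;

        public void printPageText() {
            // key: URL, value: extracted page text (empty string on failure)
            Map<String, String> pages = webSearchService.webCrawler(
                    List.of("https://example.com"));
            pages.forEach((url, text) -> System.out.println(url + " -> " + text));
        }
    }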

+ 79 - 0
yudao-module-ai/yudao-module-ai-biz/src/main/java/cn/iocoder/yudao/module/ai/service/websearch/WebSearchServiceImpl.java

@@ -10,9 +10,13 @@ package cn.iocoder.yudao.module.ai.service.websearch;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 /**
 * Bing Web search implementation class
@@ -136,4 +140,79 @@ public class WebSearchServiceImpl implements WebSearchService {
             return CollUtil.newArrayList();
         }
     }
+
+    /**
+     * Web crawler.
+     *
+     * @param urls URLs to crawl
+     * @return map whose key is the URL and whose value is the crawled page text
+     */
+    @Override
+    public Map<String, String> webCrawler(List<String> urls) {
+        if (CollUtil.isEmpty(urls)) {
+            return Map.of();
+        }
+        
+        Map<String, String> result = new HashMap<>();
+        for (String url : urls) {
+            try {
+                // Parse the URL to derive the Origin header value
+                String origin = extractOrigin(url);
+                
+                // Send an HTTP GET request to fetch the page content
+                HttpResponse response = HttpRequest.get(url)
+                        .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+                        .header("Origin", origin)
+                        .header("Referer", origin)
+                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
+                        .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
+                        .header("Cache-Control", "max-age=0")
+                        .timeout(10000) // 10-second timeout
+                        .execute();
+                
+                if (response.isOk()) {
+                    String html = response.body();
+                    
+                    // Parse the HTML with Jsoup to extract its text content
+                    Document doc = Jsoup.parse(html);
+
+                    // Remove script/style/meta/link elements; they contain no visible text
+                    doc.select("script, style, meta, link").remove();
+                    
+                    // Get the text content of the body
+                    String text = doc.body().text();
+                    
+                    // Clean up the text (collapse redundant whitespace)
+                    text = text.replaceAll("\\s+", " ").trim();
+                    
+                    result.put(url, text);
+                } else {
+                    log.warn("[webCrawler][URL({}) request failed, status code: {}]", url, response.getStatus());
+                    result.put(url, "");
+                }
+            } catch (Exception e) {
+                log.error("[webCrawler][URL({}) crawl exception]", url, e);
+                result.put(url, "");
+            }
+        }
+        
+        return result;
+    }
+    
+    /**
+     * Extracts the Origin from a URL.
+     *
+     * @param url full URL
+     * @return Origin (scheme://host[:port])
+     */
+    private String extractOrigin(String url) {
+        try {
+            java.net.URL parsedUrl = new java.net.URL(url);
+            return parsedUrl.getProtocol() + "://" + parsedUrl.getHost() + 
+                   (parsedUrl.getPort() == -1 ? "" : ":" + parsedUrl.getPort());
+        } catch (Exception e) {
+            log.warn("[extractOrigin][URL({}) parse exception]", url, e);
+            return "";
+        }
+    }
 }
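
A quick standalone illustration of what extractOrigin should produce. This demo class is not part of the commit; it re-runs the same scheme/host/port derivation on sample inputs:

    import java.net.URL;

    public class OriginDemo {
        public static void main(String[] args) throws Exception {
            String[] samples = {
                    "https://example.com/a/b?c=1",     // -> https://example.com
                    "http://example.com:8080/index"    // -> http://example.com:8080
            };
            for (String url : samples) {
                URL u = new URL(url);
                // Same derivation as extractOrigin: scheme://host[:port],
                // omitting the port when the URL does not specify one
                String origin = u.getProtocol() + "://" + u.getHost()
                        + (u.getPort() == -1 ? "" : ":" + u.getPort());
                System.out.println(url + " -> " + origin);
            }
        }
    }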

+ 21 - 0
yudao-module-ai/yudao-module-ai-biz/src/test/java/cn/iocoder/yudao/module/ai/service/WebSearchServiceTests.java

@@ -0,0 +1,21 @@
+package cn.iocoder.yudao.module.ai.service;
+
+import cn.iocoder.yudao.module.ai.service.websearch.WebSearchServiceImpl;
+import com.google.common.collect.Lists;
+import org.junit.jupiter.api.Test;
+
+import java.util.Map;
+
+public class WebSearchServiceTests {
+
+    @Test
+    public void webCrawlerTest() {
+        WebSearchServiceImpl webSearchService = new WebSearchServiceImpl();
+        Map<String, String> webCrawlerRes = webSearchService.webCrawler(
+                Lists.newArrayList("https://tianqi.eastday.com/changsha/40/"));
+
+        for (Map.Entry<String, String> entry : webCrawlerRes.entrySet()) {
+            System.err.println(entry.getValue());
+        }
+    }
+}
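
Note that this test performs live HTTP requests against an external site, so it will be slow and flaky in CI. One possible guard (an assumption on my part, not in the commit) is JUnit 5's @Disabled annotation:

    import org.junit.jupiter.api.Disabled;
    import org.junit.jupiter.api.Test;

    public class WebSearchServiceTests {

        // Skipped by default; remove the annotation to run the crawler manually.
        @Disabled("performs live HTTP requests against an external site")
        @Test
        public void webCrawlerTest() {
            // ... same body as above ...
        }
    }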