From 7746376c302e7ef1da9aba268a67baaa1d2e5175 Mon Sep 17 00:00:00 2001 From: tangwang Date: Thu, 5 Feb 2026 14:32:14 +0800 Subject: [PATCH] 日志统一用英文 --- data/wanbang/amazon_crawler.py | 64 ++++++++++++++++++++++++++++++++-------------------------------- data/wanbang/analyze_results.py | 60 ++++++++++++++++++++++++++++++------------------------------ docs/常用查询 - sql.sql | 3 +++ scripts/check_es_data.py | 56 ++++++++++++++++++++++++++++---------------------------- scripts/monitor_eviction.py | 32 ++++++++++++++++---------------- start_reranker.sh | 3 +++ 6 files changed, 112 insertions(+), 106 deletions(-) create mode 100644 start_reranker.sh diff --git a/data/wanbang/amazon_crawler.py b/data/wanbang/amazon_crawler.py index 5d1f285..99f6527 100755 --- a/data/wanbang/amazon_crawler.py +++ b/data/wanbang/amazon_crawler.py @@ -85,7 +85,7 @@ class AmazonCrawler: params[param] = kwargs[param] try: - logger.info(f"正在请求: {query}") + logger.info(f"Making request: {query}") self.total_requests += 1 response = requests.get( @@ -98,20 +98,20 @@ class AmazonCrawler: data = response.json() if data.get('error_code') == '0000': - logger.info(f"✓ 成功: {query} - 获得 {data.get('items', {}).get('real_total_results', 0)} 个结果") + logger.info(f"✓ Success: {query} - Got {data.get('items', {}).get('real_total_results', 0)} results") self.successful_requests += 1 return data else: - logger.error(f"✗ API错误: {query} - {data.get('reason', 'Unknown error')}") + logger.error(f"✗ API error: {query} - {data.get('reason', 'Unknown error')}") self.failed_requests += 1 return data except requests.exceptions.RequestException as e: - logger.error(f"✗ 请求失败: {query} - {str(e)}") + logger.error(f"✗ Request failed: {query} - {str(e)}") self.failed_requests += 1 return None except json.JSONDecodeError as e: - logger.error(f"✗ JSON解析失败: {query} - {str(e)}") + logger.error(f"✗ JSON parse failed: {query} - {str(e)}") self.failed_requests += 1 return None @@ -127,36 +127,36 @@ class AmazonCrawler: try: with 
open(filepath, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) - logger.debug(f"已保存: {filename}") + logger.debug(f"Saved: {filename}") except Exception as e: - logger.error(f"保存失败: {filename} - {str(e)}") + logger.error(f"Save failed: {filename} - {str(e)}") def crawl_from_file(self, queries_file: str, delay: float = 1.0, start_index: int = 0, max_queries: Optional[int] = None): """从文件读取查询列表并批量爬取""" self.start_time = datetime.now() logger.info("=" * 70) - logger.info(f"Amazon爬虫启动 - {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}") + logger.info(f"Amazon crawler started - {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}") logger.info("=" * 70) - logger.info(f"查询文件: {queries_file}") - logger.info(f"结果目录: {self.results_dir}") + logger.info(f"Queries file: {queries_file}") + logger.info(f"Results directory: {self.results_dir}") try: with open(queries_file, 'r', encoding='utf-8') as f: queries = [line.strip() for line in f if line.strip()] total_queries = len(queries) - logger.info(f"共读取 {total_queries} 个查询") + logger.info(f"Total queries read: {total_queries}") if start_index > 0: queries = queries[start_index:] - logger.info(f"从索引 {start_index} 开始") + logger.info(f"Starting from index {start_index}") if max_queries: queries = queries[:max_queries] - logger.info(f"限制爬取数量: {max_queries}") + logger.info(f"Limit crawl count to: {max_queries}") - logger.info(f"请求间隔: {delay} 秒") + logger.info(f"Request interval: {delay} seconds") logger.info("=" * 70) # 逐个爬取 @@ -189,24 +189,24 @@ class AmazonCrawler: duration = end_time - self.start_time logger.info("=" * 70) - logger.info("爬取完成!") + logger.info("Crawling completed!") logger.info("=" * 70) - logger.info(f"开始时间: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}") - logger.info(f"结束时间: {end_time.strftime('%Y-%m-%d %H:%M:%S')}") - logger.info(f"总耗时: {duration}") - logger.info(f"总请求数: {self.total_requests}") - logger.info(f"成功: {self.successful_requests} 
({self.successful_requests/self.total_requests*100:.1f}%)") - logger.info(f"失败: {self.failed_requests} ({self.failed_requests/self.total_requests*100:.1f}%)") - logger.info(f"结果保存在: {self.results_dir.absolute()}") + logger.info(f"Start time: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}") + logger.info(f"End time: {end_time.strftime('%Y-%m-%d %H:%M:%S')}") + logger.info(f"Total duration: {duration}") + logger.info(f"Total requests: {self.total_requests}") + logger.info(f"Successful: {self.successful_requests} ({self.successful_requests/self.total_requests*100:.1f}%)") + logger.info(f"Failed: {self.failed_requests} ({self.failed_requests/self.total_requests*100:.1f}%)") + logger.info(f"Results saved to: {self.results_dir.absolute()}") logger.info("=" * 70) except FileNotFoundError: - logger.error(f"文件不存在: {queries_file}") + logger.error(f"File not found: {queries_file}") except KeyboardInterrupt: - logger.warning("\n用户中断爬取") - logger.info(f"已完成: {self.successful_requests}/{self.total_requests}") + logger.warning("\nUser interrupted crawling") + logger.info(f"Completed: {self.successful_requests}/{self.total_requests}") except Exception as e: - logger.error(f"爬取过程出错: {str(e)}", exc_info=True) + logger.error(f"Error during crawling: {str(e)}", exc_info=True) def load_config(): @@ -216,7 +216,7 @@ def load_config(): import config return config except ImportError: - logger.warning("未找到配置文件 config.py,使用默认配置") + logger.warning("Config file config.py not found, using default configuration") return None @@ -258,12 +258,12 @@ def main(): if not api_key or not api_secret or \ api_key == "your_api_key_here" or api_secret == "your_api_secret_here": logger.error("=" * 70) - logger.error("错误: 未配置API密钥!") + logger.error("Error: API key not configured!") logger.error("") - logger.error("请使用以下任一方式配置API密钥:") - logger.error("1. 命令行参数: --key YOUR_KEY --secret YOUR_SECRET") - logger.error("2. 配置文件: 复制 config.example.py 为 config.py 并填入密钥") - logger.error("3. 
环境变量: ONEBOUND_API_KEY 和 ONEBOUND_API_SECRET") + logger.error("Please configure API key using one of the following methods:") + logger.error("1. Command line arguments: --key YOUR_KEY --secret YOUR_SECRET") + logger.error("2. Config file: Copy config.example.py to config.py and fill in the keys") + logger.error("3. Environment variables: ONEBOUND_API_KEY and ONEBOUND_API_SECRET") logger.error("=" * 70) return diff --git a/data/wanbang/analyze_results.py b/data/wanbang/analyze_results.py index 9f8b321..a1c0709 100755 --- a/data/wanbang/analyze_results.py +++ b/data/wanbang/analyze_results.py @@ -30,22 +30,22 @@ class ResultAnalyzer: """ self.results_dir = Path(results_dir) if not self.results_dir.exists(): - logger.error(f"结果目录不存在: {self.results_dir}") + logger.error(f"Results directory does not exist: {self.results_dir}") raise FileNotFoundError(f"Directory not found: {self.results_dir}") def analyze(self): """执行完整分析""" logger.info("=" * 70) - logger.info("Amazon爬取结果分析") + logger.info("Amazon crawling result analysis") logger.info("=" * 70) - logger.info(f"结果目录: {self.results_dir.absolute()}") + logger.info(f"Results directory: {self.results_dir.absolute()}") # 获取所有JSON文件 json_files = list(self.results_dir.glob("*.json")) - logger.info(f"JSON文件数量: {len(json_files)}") + logger.info(f"Number of JSON files: {len(json_files)}") if not json_files: - logger.warning("未找到任何JSON文件") + logger.warning("No JSON files found") return # 统计数据 @@ -62,7 +62,7 @@ class ResultAnalyzer: } # 分析每个文件 - logger.info("\n正在分析文件...") + logger.info("\nAnalyzing files...") for json_file in json_files: try: with open(json_file, 'r', encoding='utf-8') as f: @@ -118,7 +118,7 @@ class ResultAnalyzer: stats['failed'] += 1 except Exception as e: - logger.error(f"分析文件失败 {json_file.name}: {str(e)}") + logger.error(f"Failed to analyze file {json_file.name}: {str(e)}") stats['failed'] += 1 # 输出统计结果 @@ -130,29 +130,29 @@ class ResultAnalyzer: def print_stats(self, stats: Dict): """打印统计信息""" 
logger.info("\n" + "=" * 70) - logger.info("统计结果") + logger.info("Statistics results") logger.info("=" * 70) # 基本统计 - logger.info(f"\n【文件统计】") - logger.info(f"总文件数: {stats['total_files']}") - logger.info(f"成功: {stats['successful']} ({stats['successful']/stats['total_files']*100:.1f}%)") - logger.info(f"失败: {stats['failed']} ({stats['failed']/stats['total_files']*100:.1f}%)") - + logger.info(f"\n[File Statistics]") + logger.info(f"Total files: {stats['total_files']}") + logger.info(f"Successful: {stats['successful']} ({stats['successful']/stats['total_files']*100:.1f}%)") + logger.info(f"Failed: {stats['failed']} ({stats['failed']/stats['total_files']*100:.1f}%)") + # 商品统计 - logger.info(f"\n【商品统计】") - logger.info(f"总商品数: {stats['total_items']}") + logger.info(f"\n[Product Statistics]") + logger.info(f"Total products: {stats['total_items']}") if stats['items_per_query']: avg_items = sum(stats['items_per_query']) / len(stats['items_per_query']) max_items = max(stats['items_per_query']) min_items = min(stats['items_per_query']) - logger.info(f"平均每个查询: {avg_items:.1f} 个商品") - logger.info(f"最多: {max_items} 个") - logger.info(f"最少: {min_items} 个") + logger.info(f"Average per query: {avg_items:.1f} products") + logger.info(f"Maximum: {max_items} products") + logger.info(f"Minimum: {min_items} products") # 价格分布 if stats['price_ranges']: - logger.info(f"\n【价格分布】") + logger.info(f"\n[Price Distribution]") total_priced = sum(stats['price_ranges'].values()) for price_range, count in sorted(stats['price_ranges'].items()): percentage = count / total_priced * 100 @@ -162,15 +162,15 @@ class ResultAnalyzer: if stats['avg_reviews']: avg_reviews = sum(stats['avg_reviews']) / len(stats['avg_reviews']) max_reviews = max(stats['avg_reviews']) - logger.info(f"\n【评论统计】") - logger.info(f"平均评论数: {avg_reviews:.0f}") - logger.info(f"最高评论数: {max_reviews}") + logger.info(f"\n[Review Statistics]") + logger.info(f"Average reviews: {avg_reviews:.0f}") + logger.info(f"Highest reviews: 
{max_reviews}") # 评分统计 if stats['avg_stars']: avg_stars = sum(stats['avg_stars']) / len(stats['avg_stars']) - logger.info(f"\n【评分统计】") - logger.info(f"平均评分: {avg_stars:.2f}") + logger.info(f"\n[Rating Statistics]") + logger.info(f"Average rating: {avg_stars:.2f}") logger.info("\n" + "=" * 70) @@ -204,9 +204,9 @@ class ResultAnalyzer: try: with open(report_file, 'w', encoding='utf-8') as f: json.dump(report, f, ensure_ascii=False, indent=2) - logger.info(f"分析报告已保存: {report_file}") + logger.info(f"Analysis report saved: {report_file}") except Exception as e: - logger.error(f"保存报告失败: {str(e)}") + logger.error(f"Failed to save report: {str(e)}") def export_csv(self, output_file: str = None): """导出为CSV格式""" @@ -215,7 +215,7 @@ class ResultAnalyzer: if output_file is None: output_file = self.results_dir / "items_export.csv" - logger.info(f"\n导出CSV: {output_file}") + logger.info(f"\nExporting to CSV: {output_file}") json_files = list(self.results_dir.glob("*.json")) @@ -243,9 +243,9 @@ class ResultAnalyzer: item.get('detail_url', '') ]) except Exception as e: - logger.error(f"导出失败 {json_file.name}: {str(e)}") + logger.error(f"Export failed for {json_file.name}: {str(e)}") - logger.info(f"CSV导出完成: {output_file}") + logger.info(f"CSV export completed: {output_file}") def main(): @@ -270,7 +270,7 @@ def main(): analyzer.export_csv(args.output) except Exception as e: - logger.error(f"分析失败: {str(e)}") + logger.error(f"Analysis failed: {str(e)}") if __name__ == "__main__": diff --git a/docs/常用查询 - sql.sql b/docs/常用查询 - sql.sql index d491213..12aa787 100644 --- a/docs/常用查询 - sql.sql +++ b/docs/常用查询 - sql.sql @@ -1,6 +1,9 @@ -- 查询今天入库的SPU和SKU商品数据 -- 用于查询当天新增的商品信息 + +select id, title from shoplazza_product_spu where tenant_id = 170 and deleted = 0 ; + -- ====================================== -- 1. 
查询今天入库的SPU商品 -- ====================================== diff --git a/scripts/check_es_data.py b/scripts/check_es_data.py index bc3e753..33da512 100755 --- a/scripts/check_es_data.py +++ b/scripts/check_es_data.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -检查ES索引中的实际数据,看分面字段是否有值 +Check actual data in ES index to see if facet fields have values """ import sys @@ -14,9 +14,9 @@ from utils.es_client import ESClient def check_es_facet_fields(es_client, tenant_id: str, size: int = 5): - """检查ES中的分面相关字段""" + """Check facet-related fields in ES""" print("\n" + "="*60) - print("检查ES索引中的分面字段数据") + print("Checking facet field data in ES index") print("="*60) query = { @@ -46,14 +46,14 @@ def check_es_facet_fields(es_client, tenant_id: str, size: int = 5): hits = response.get('hits', {}).get('hits', []) total = response.get('hits', {}).get('total', {}).get('value', 0) - print(f"\n总文档数: {total}") - print(f"检查前 {len(hits)} 个文档:\n") + print(f"\nTotal documents: {total}") + print(f"Checking first {len(hits)} documents:\n") for i, hit in enumerate(hits, 1): source = hit.get('_source', {}) title_obj = source.get("title") or {} category_path_obj = source.get("category_path") or {} - print(f"文档 {i}:") + print(f"Document {i}:") print(f" spu_id: {source.get('spu_id')}") print(f" title.zh: {str(title_obj.get('zh', ''))[:50] if isinstance(title_obj, dict) else ''}") print(f" category1_name: {source.get('category1_name')}") @@ -67,24 +67,24 @@ def check_es_facet_fields(es_client, tenant_id: str, size: int = 5): specs = source.get('specifications', []) if specs: - print(f" specifications 数量: {len(specs)}") + print(f" specifications count: {len(specs)}") # 显示前3个specifications for spec in specs[:3]: print(f" - name: {spec.get('name')}, value: {spec.get('value')}") else: - print(f" specifications: 空") + print(f" specifications: empty") print() except Exception as e: - print(f"错误: {e}") + print(f"Error: {e}") import traceback traceback.print_exc() def check_facet_aggregations(es_client, 
tenant_id: str): - """检查分面聚合查询""" + """Check facet aggregation queries""" print("\n" + "="*60) - print("检查分面聚合查询结果") + print("Checking facet aggregation query results") print("="*60) query = { @@ -174,16 +174,16 @@ def check_facet_aggregations(es_client, tenant_id: str): response = es_client.client.search(index="search_products", body=query) aggs = response.get('aggregations', {}) - print("\n1. category1_name 分面:") + print("\n1. category1_name facet:") category1 = aggs.get('category1_facet', {}) buckets = category1.get('buckets', []) if buckets: for bucket in buckets: print(f" {bucket['key']}: {bucket['doc_count']}") else: - print(" 空(没有数据)") + print(" empty (no data)") - print("\n2. specifications.color 分面:") + print("\n2. specifications.color facet:") color_agg = aggs.get('color_facet', {}) color_filter = color_agg.get('filter_by_name', {}) color_values = color_filter.get('value_counts', {}) @@ -192,9 +192,9 @@ def check_facet_aggregations(es_client, tenant_id: str): for bucket in color_buckets: print(f" {bucket['key']}: {bucket['doc_count']}") else: - print(" 空(没有数据)") + print(" empty (no data)") - print("\n3. specifications.size 分面:") + print("\n3. specifications.size facet:") size_agg = aggs.get('size_facet', {}) size_filter = size_agg.get('filter_by_name', {}) size_values = size_filter.get('value_counts', {}) @@ -203,9 +203,9 @@ def check_facet_aggregations(es_client, tenant_id: str): for bucket in size_buckets: print(f" {bucket['key']}: {bucket['doc_count']}") else: - print(" 空(没有数据)") + print(" empty (no data)") - print("\n4. specifications.material 分面:") + print("\n4. 
specifications.material facet:") material_agg = aggs.get('material_facet', {}) material_filter = material_agg.get('filter_by_name', {}) material_values = material_filter.get('value_counts', {}) @@ -214,19 +214,19 @@ def check_facet_aggregations(es_client, tenant_id: str): for bucket in material_buckets: print(f" {bucket['key']}: {bucket['doc_count']}") else: - print(" 空(没有数据)") + print(" empty (no data)") except Exception as e: - print(f"错误: {e}") + print(f"Error: {e}") import traceback traceback.print_exc() def main(): - parser = argparse.ArgumentParser(description='检查ES索引中的分面字段数据') + parser = argparse.ArgumentParser(description='Check facet field data in ES index') parser.add_argument('--tenant-id', required=True, help='Tenant ID') - parser.add_argument('--es-host', help='Elasticsearch host (或使用环境变量 ES_HOST, 默认: http://localhost:9200)') - parser.add_argument('--size', type=int, default=5, help='检查的文档数量 (默认: 5)') + parser.add_argument('--es-host', help='Elasticsearch host (or use env var ES_HOST, default: http://localhost:9200)') + parser.add_argument('--size', type=int, default=5, help='Number of documents to check (default: 5)') args = parser.parse_args() @@ -235,7 +235,7 @@ def main(): es_username = os.environ.get('ES_USERNAME') es_password = os.environ.get('ES_PASSWORD') - print(f"连接Elasticsearch: {es_host}") + print(f"Connecting to Elasticsearch: {es_host}") print(f"Tenant ID: {args.tenant_id}\n") try: @@ -245,11 +245,11 @@ def main(): es_client = ESClient(hosts=[es_host]) if not es_client.ping(): - print(f"✗ 无法连接到Elasticsearch: {es_host}") + print(f"✗ Cannot connect to Elasticsearch: {es_host}") return 1 - print("✓ Elasticsearch连接成功\n") + print("✓ Elasticsearch connected successfully\n") except Exception as e: - print(f"✗ 连接Elasticsearch失败: {e}") + print(f"✗ Failed to connect to Elasticsearch: {e}") return 1 # 检查ES数据 @@ -257,7 +257,7 @@ def main(): check_facet_aggregations(es_client, args.tenant_id) print("\n" + "="*60) - print("检查完成") + print("Check 
completed") print("="*60) return 0 diff --git a/scripts/monitor_eviction.py b/scripts/monitor_eviction.py index 26d87bd..37be401 100755 --- a/scripts/monitor_eviction.py +++ b/scripts/monitor_eviction.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """ -实时监控 Redis 缓存淘汰事件 +Real-time monitoring of Redis cache eviction events -持续监控 evicted_keys 统计,当有新的淘汰发生时发出警告 +Continuously monitor evicted_keys statistics and warn when new evictions occur """ import redis @@ -18,7 +18,7 @@ sys.path.insert(0, str(project_root)) from config.env_config import REDIS_CONFIG def get_redis_client(): - """获取 Redis 客户端""" + """Get Redis client""" return redis.Redis( host=REDIS_CONFIG.get('host', 'localhost'), port=REDIS_CONFIG.get('port', 6479), @@ -29,12 +29,12 @@ def get_redis_client(): ) def monitor_eviction(interval=5): - """持续监控淘汰事件""" + """Continuously monitor eviction events""" print("=" * 60) - print("Redis 缓存淘汰实时监控") + print("Redis Cache Eviction Real-time Monitoring") print("=" * 60) - print(f"监控间隔: {interval} 秒") - print("按 Ctrl+C 停止监控") + print(f"Monitoring interval: {interval} seconds") + print("Press Ctrl+C to stop monitoring") print("=" * 60) print() @@ -42,7 +42,7 @@ def monitor_eviction(interval=5): client = get_redis_client() client.ping() except Exception as e: - print(f"❌ Redis 连接失败: {e}") + print(f"❌ Redis connection failed: {e}") return last_evicted = 0 @@ -55,8 +55,8 @@ def monitor_eviction(interval=5): if current_evicted > last_evicted: new_evictions = current_evicted - last_evicted timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - print(f"[{timestamp}] ⚠️ 检测到 {new_evictions} 个新的淘汰事件!") - print(f" 累计淘汰总数: {current_evicted:,}") + print(f"[{timestamp}] ⚠️ Detected {new_evictions} new eviction events!") + print(f" Total evictions: {current_evicted:,}") # 检查内存使用情况 mem_info = client.info('memory') @@ -64,26 +64,26 @@ def monitor_eviction(interval=5): used_memory = mem_info.get('used_memory', 0) if maxmemory > 0: usage_percent = (used_memory / maxmemory) * 100 - print(f" 
当前内存使用率: {usage_percent:.2f}%") + print(f" Current memory usage: {usage_percent:.2f}%") last_evicted = current_evicted else: timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') - print(f"[{timestamp}] ✅ 无新淘汰事件 (累计: {current_evicted:,})") + print(f"[{timestamp}] ✅ No new eviction events (Total: {current_evicted:,})") time.sleep(interval) except KeyboardInterrupt: - print("\n\n监控已停止") + print("\n\nMonitoring stopped") except Exception as e: - print(f"\n❌ 监控出错: {e}") + print(f"\n❌ Monitoring error: {e}") import traceback traceback.print_exc() if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description='实时监控 Redis 缓存淘汰事件') - parser.add_argument('--interval', type=int, default=5, help='监控间隔(秒),默认 5 秒') + parser = argparse.ArgumentParser(description='Real-time monitoring of Redis cache eviction events') + parser.add_argument('--interval', type=int, default=5, help='Monitoring interval in seconds (default: 5)') args = parser.parse_args() monitor_eviction(interval=args.interval) diff --git a/start_reranker.sh b/start_reranker.sh new file mode 100644 index 0000000..ae86205 --- /dev/null +++ b/start_reranker.sh @@ -0,0 +1,3 @@ + +uvicorn reranker.server:app --host 0.0.0.0 --port 6007 + -- libgit2 0.21.2